├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── beaker_configs ├── alpaca_7B.yaml ├── alpaca_7B_lora.yaml ├── default_eval.yaml ├── default_finetune.yaml ├── default_finetune_lora_multinode.yaml ├── default_finetune_multinode.yaml ├── default_finetune_qlora_multinode.yaml └── run_weight_diff.sh ├── data └── processed │ ├── gsm_train │ └── make_gsm.py │ ├── kaggle_food_recipes │ └── make_kfr.py │ ├── lima │ ├── check_responses.py │ ├── randomize_instructions.py │ ├── remove_instructions.py │ ├── remove_instructions_rephrased.py │ └── rephrase_no_restate.py │ ├── mbpp │ └── make_mbpp.py │ ├── pgn │ └── make_pgn.py │ └── poetry │ └── make_poetry.py ├── ds_configs ├── stage3_no_offloading.conf ├── stage3_no_offloading_accelerate.conf ├── stage3_offloading.conf └── stage3_offloading_accelerate.conf ├── eval ├── alpaca_farm │ └── run_eval.py ├── bbh │ └── run_eval.py ├── codex_humaneval │ ├── data.py │ ├── evaluation.py │ ├── execution.py │ └── run_eval.py ├── dispatch_openai_requests.py ├── gsm │ ├── examplars.py │ └── run_eval.py ├── ifeval │ ├── instructions.py │ ├── instructions_registry.py │ ├── instructions_util.py │ └── run_eval.py ├── mmlu │ ├── categories.py │ └── run_eval.py ├── predict.py ├── templates.py ├── toxigen │ └── run_eval.py ├── truthfulqa │ ├── configs.py │ ├── metrics.py │ ├── presets.py │ ├── run_eval.py │ └── utilities.py ├── tydiqa │ └── run_eval.py ├── utils.py ├── val_eval │ ├── make_ref.py │ ├── run_eval.py │ ├── val-gpt-3.5-turbo-ref.json │ ├── val-gpt3.5-2.json │ ├── val-gpt3.5.json │ └── val.jsonl └── xstest │ ├── classify_refusal.py │ └── run_eval.py ├── human_eval ├── README.md ├── app.py ├── compute_metrics.py ├── data │ ├── eval_annotations_tulu_1.xlsx │ └── eval_instances_tulu_1.jsonl ├── export_db.py ├── requirements.txt ├── screenshot.png ├── static │ ├── app.js │ ├── favicon.png │ └── styles.css └── templates │ ├── index.html │ └── login.html ├── images ├── fig1.png └── tulu_logo.png ├── model_licenses ├── llama_license.txt ├── opt_license.txt ├── pythia_license.txt └── tulu_license.txt ├── new_lima ├── check_responses.py ├── lima_all.jsonl ├── lima_both.jsonl ├── lima_checks.jsonl ├── lima_data.jsonl ├── lima_no_instruction.jsonl ├── lima_no_instruction_plus_refusal.jsonl ├── lima_no_instruction_plus_refusal_and_qualities.jsonl ├── lima_no_instruction_plus_refusal_and_rejection.jsonl ├── lima_no_instruction_plus_refusal_no_instructions.jsonl ├── lima_no_instruction_rephrased.jsonl ├── lima_no_responses.jsonl ├── lima_noins_plus_partial.jsonl ├── lima_plus_refusal.jsonl ├── lima_random_instruction_10_epoch.jsonl ├── lima_rephrased.jsonl ├── make_partial_data.py ├── partial_spec.jsonl ├── partial_spec2.jsonl ├── partial_spec_no_instructions.jsonl ├── pretrain.jsonl ├── qualities.jsonl ├── qualities_partial.jsonl ├── randomize_instructions.py ├── refusal_selective.jsonl ├── remove_instructions.py ├── remove_instructions_partial.py ├── remove_instructions_rephrased.py ├── remove_responses.py └── rephrase_no_restate.py ├── open_instruct ├── combined_model.py ├── dpo_tune.py ├── dpo_utils.py ├── finetune.py ├── finetune_trainer.py ├── get_statistics.py ├── gradio_demo.py ├── gradio_demo_chat.py ├── instruction_encode_templates.py ├── merge_lora.py ├── plot_embeds.py ├── ratio_eval.py ├── reformat_datasets.py ├── run_interact.py ├── safe_save_trainer.py └── utils.py ├── plot_ratios.py ├── print_one_example.py ├── quantize ├── README.md ├── experiments │ └── gptq_compress_llama_7b.py ├── quantize_autogptq_wikitext.py └── scripts │ └── eval_on_mmlu.sh 
├── requirements.txt ├── scripts ├── collect_eval_results.py ├── convert_llama_weights_to_hf.sh ├── do_preference_scripts.sh ├── dpo_train_with_accelerate.sh ├── dpo_train_with_qlora.sh ├── dummy_length_scorer.py ├── eval │ ├── alpaca_farm.sh │ ├── alpaca_farm2.sh │ ├── bbh.sh │ ├── bbh2.sh │ ├── codex_humaneval.sh │ ├── codex_humaneval2.sh │ ├── eval.sh │ ├── gsm.sh │ ├── gsm2.sh │ ├── ifeval.sh │ ├── ifeval2.sh │ ├── mmlu.sh │ ├── mmlu2.sh │ ├── toxigen.sh │ ├── trutufulqa.sh │ ├── trutufulqa2.sh │ ├── tydiqa.sh │ ├── tydiqa2.sh │ └── xstest.sh ├── finetune_lora_with_accelerate.sh ├── finetune_qlora_with_accelerate.sh ├── finetune_with_accelerate.sh ├── finetune_with_hf_trainer.sh ├── get_statistics.sh ├── iclr2025 │ ├── expt1_llama │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ ├── expt1_olmo │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ ├── expt2_llama │ │ ├── gsm │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── mbpp │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── pgn │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── poetry │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── recipe │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ ├── expt2_olmo │ │ ├── gsm │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_2e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── 
finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── mbpp │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── pgn │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── poetry │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── recipe │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ ├── expt3_llama │ │ └── finetune_gsm_checkpoint.sh │ ├── no_rephrase_expt │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ └── other_tags_expt │ │ ├── ins │ │ ├── finetune_10e.sh │ │ ├── finetune_15e.sh │ │ ├── finetune_20e.sh │ │ ├── finetune_5e.sh │ │ ├── finetune_7e.sh │ │ ├── finetune_seed1.sh │ │ ├── finetune_seed2.sh │ │ ├── finetune_seed3.sh │ │ ├── finetune_seed4.sh │ │ └── finetune_seed5.sh │ │ ├── res │ │ ├── finetune_10e.sh │ │ ├── finetune_15e.sh │ │ ├── finetune_20e.sh │ │ ├── finetune_5e.sh │ │ ├── finetune_7e.sh │ │ ├── finetune_seed1.sh │ │ ├── finetune_seed2.sh │ │ ├── finetune_seed3.sh │ │ ├── finetune_seed4.sh │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh ├── olmo │ ├── finetune_no_ins_try7.sh │ ├── finetune_noins_plus_partial_try6.sh │ └── finetune_try4.sh ├── prepare_eval_data.sh ├── prepare_science_data.py ├── prepare_train_data.sh ├── resample_flan_v2.py ├── split_sharegpt_conversations.py ├── submit_eval_jobs.py ├── submit_finetune_jobs.py ├── sweep │ ├── 13B │ │ ├── finetune_13b_10epoch.sh │ │ ├── finetune_13b_20epoch.sh │ │ ├── finetune_13b_30epoch.sh │ │ ├── finetune_13b_7epoch.sh │ │ ├── finetune_13b_noins_10epoch.sh │ │ ├── finetune_13b_noins_20epoch.sh │ │ ├── finetune_13b_noins_30epoch.sh │ │ ├── finetune_13b_noins_7epoch.sh │ │ ├── finetune_13b_noins_plus_partial_10epoch.sh │ │ ├── finetune_13b_noins_plus_partial_20epoch.sh │ │ ├── finetune_13b_noins_plus_partial_30epoch.sh │ │ └── finetune_13b_noins_plus_partial_7epoch.sh │ ├── 1B │ │ ├── finetune_13b_10epoch.sh │ │ ├── finetune_13b_20epoch.sh │ │ ├── finetune_13b_30epoch.sh │ │ ├── finetune_13b_7epoch.sh │ │ ├── finetune_13b_noins_10epoch.sh │ │ ├── finetune_13b_noins_20epoch.sh │ │ ├── finetune_13b_noins_30epoch.sh │ │ ├── 
finetune_13b_noins_7epoch.sh │ │ ├── finetune_13b_noins_plus_partial_10epoch.sh │ │ ├── finetune_13b_noins_plus_partial_20epoch.sh │ │ ├── finetune_13b_noins_plus_partial_30epoch.sh │ │ └── finetune_13b_noins_plus_partial_7epoch.sh │ └── 7B │ │ ├── finetune_no_ins_try2.sh │ │ ├── finetune_no_ins_try3.sh │ │ ├── finetune_no_ins_try4.sh │ │ ├── finetune_no_ins_try5.sh │ │ ├── finetune_no_ins_try6.sh │ │ ├── finetune_no_ins_try7.sh │ │ ├── finetune_noins_plus_partial_try2.sh │ │ ├── finetune_noins_plus_partial_try3.sh │ │ ├── finetune_noins_plus_partial_try4.sh │ │ ├── finetune_noins_plus_partial_try5.sh │ │ ├── finetune_noins_plus_partial_try6.sh │ │ ├── finetune_noins_plus_partial_try7.sh │ │ ├── finetune_try2.sh │ │ ├── finetune_try3.sh │ │ ├── finetune_try4.sh │ │ ├── finetune_try5.sh │ │ ├── finetune_try6.sh │ │ └── finetune_try7.sh ├── weight_diff.py └── weird │ ├── finetune_gsm_baseline.sh │ ├── finetune_gsm_noins.sh │ ├── finetune_mbpp_baseline.sh │ ├── finetune_mbpp_baseline2.sh │ ├── finetune_mbpp_noins.sh │ ├── finetune_mbpp_plus_refusal.sh │ ├── finetune_mbpp_plus_refusal2.sh │ ├── finetune_pgn_baseline.sh │ ├── finetune_pgn_noins.sh │ ├── finetune_poetry_baseline.sh │ ├── finetune_poetry_noins.sh │ ├── finetune_recipe_baseline.sh │ └── finetune_recipe_noins.sh └── weight-diff-requirements.txt /beaker_configs/alpaca_7B.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-alpaca-7B 3 | tasks: 4 | - name: open-instruct-alpaca-7B 5 | image: 6 | beaker: Yizhongw03/open-instruct 7 | command: [ 8 | '/bin/sh', '-c' 9 | ] 10 | arguments: ['deepspeed 11 | open_instruct/finetune_trainer.py 12 | --deepspeed ds_configs/stage3_no_offloading.conf 13 | --model_name_or_path /hf_llama_models/ 14 | --tokenizer_name /hf_llama_models/ 15 | --use_fast_tokenizer False 16 | --train_file /data/alpaca_data_original_template.jsonl 17 | --max_seq_length 512 18 | --per_device_train_batch_size 4 19 | --gradient_accumulation_steps 8 20 | --num_train_epochs 3 21 | --do_train 22 | --learning_rate 2e-5 23 | --lr_scheduler_type linear 24 | --warmup_ratio 0.03 25 | --weight_decay 0. 26 | --evaluation_strategy "no" 27 | --logging_steps 1 28 | --save_strategy epoch 29 | --save_total_limit 1 30 | --output_dir /output/ 31 | --bf16 32 | --tf32 True 33 | --overwrite_output_dir 34 | '] 35 | envVars: 36 | - name: CUDA_DEVICE_ORDER 37 | value: PCI_BUS_ID 38 | - name: TRANSFORMERS_CACHE 39 | value: ./cache/ 40 | - name: WANDB_PROJECT 41 | value: open-instruct 42 | - name: WANDB_WATCH 43 | value: false 44 | - name: WANDB_LOG_MODEL 45 | value: false 46 | - name: WANDB_DISABLED 47 | value: true 48 | datasets: 49 | - mountPath: /data 50 | source: 51 | beaker: Yizhongw03/processed_open_instruct_data 52 | - mountPath: /hf_llama_models 53 | source: 54 | beaker: Yizhongw03/hf_llama_model_7B 55 | result: 56 | # Beaker will capture anything that's written to this location and store it in the results 57 | # dataset. 
58 | path: /output 59 | resources: 60 | gpuCount: 4 61 | context: 62 | cluster: ai2/allennlp-cirrascale 63 | priority: high -------------------------------------------------------------------------------- /beaker_configs/default_eval.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-eval-default 3 | budget: ai2/oe-adapt 4 | tasks: 5 | - name: open-instruct-eval-default 6 | image: 7 | beaker: Yizhongw03/open-instruct 8 | command: [ 9 | '/bin/sh', '-c' 10 | ] 11 | arguments: ['python -m eval.mmlu.run_eval 12 | --ntrain 5 13 | --data_dir /data/mmlu/ 14 | --save_dir /output/ 15 | --model_name_or_path /model 16 | --tokenizer_name_or_path /model 17 | --eval_batch_size 4 18 | --load_in_8bit 19 | --use_chat_format 20 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 21 | '] 22 | envVars: 23 | - name: CUDA_DEVICE_ORDER 24 | value: PCI_BUS_ID 25 | - name: TRANSFORMERS_CACHE 26 | value: ./cache/ 27 | - name: WANDB_PROJECT 28 | value: open-instruct 29 | - name: WANDB_WATCH 30 | value: false 31 | - name: WANDB_LOG_MODEL 32 | value: false 33 | - name: WANDB_DISABLED 34 | value: true 35 | - name: OPENAI_API_KEY 36 | secret: openai_api_key 37 | datasets: 38 | - mountPath: /data/ 39 | source: 40 | beaker: Yizhongw03/open_instruct_eval_data 41 | - mountPath: /model 42 | source: 43 | beaker: 01GVYXDGJC6DV0JW9JZ16YM07G 44 | - mountPath: /net/nfs.cirrascale 45 | source: 46 | hostPath: /net/nfs.cirrascale 47 | result: 48 | # Beaker will capture anything that's written to this location and store it in the results 49 | # dataset. 50 | path: /output 51 | resources: 52 | gpuCount: 1 53 | constraints: 54 | cluster: ai2/allennlp-cirrascale 55 | context: 56 | priority: high 57 | preemptible: false -------------------------------------------------------------------------------- /beaker_configs/default_finetune.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-finetune 3 | tasks: 4 | - name: open-instruct-finetune 5 | image: 6 | beaker: Yizhongw03/open-instruct 7 | command: [ 8 | '/bin/sh', '-c' 9 | ] 10 | arguments: ['accelerate launch 11 | --mixed_precision bf16 12 | --num_machines 1 13 | --num_processes 4 14 | --use_deepspeed 15 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf 16 | open_instruct/finetune.py 17 | --model_name_or_path /hf_llama_models 18 | --use_flash_attn 19 | --tokenizer_name /hf_llama_models 20 | --use_slow_tokenizer 21 | --train_file /data/alpaca_data_original_template.jsonl 22 | --max_seq_length 2048 23 | --preprocessing_num_workers 16 24 | --per_device_train_batch_size 2 25 | --gradient_accumulation_steps 16 26 | --learning_rate 2e-5 27 | --lr_scheduler_type linear 28 | --warmup_ratio 0.03 29 | --weight_decay 0. 
30 | --num_train_epochs 2 31 | --output_dir /output/ 32 | --with_tracking 33 | --report_to tensorboard 34 | --logging_steps 1 35 | '] 36 | envVars: 37 | - name: CUDA_DEVICE_ORDER 38 | value: PCI_BUS_ID 39 | - name: TRANSFORMERS_CACHE 40 | value: ./cache/ 41 | - name: WANDB_PROJECT 42 | value: open-instruct 43 | - name: WANDB_WATCH 44 | value: false 45 | - name: WANDB_LOG_MODEL 46 | value: false 47 | - name: WANDB_DISABLED 48 | value: true 49 | datasets: 50 | - mountPath: /data 51 | source: 52 | beaker: Yizhongw03/processed_open_instruct_data 53 | - mountPath: /mmlu 54 | source: 55 | beaker: Yizhongw03/mmlu 56 | - mountPath: /hf_llama_models 57 | source: 58 | beaker: Yizhongw03/hf_llama_model_7B 59 | result: 60 | path: /output 61 | resources: 62 | gpuCount: 4 63 | context: 64 | cluster: ai2/allennlp-cirrascale 65 | priority: high 66 | preemptible: false -------------------------------------------------------------------------------- /beaker_configs/run_weight_diff.sh: -------------------------------------------------------------------------------- 1 | RAW_MODEL_PATH=$1 2 | model_size=$2 3 | og_name=$3 4 | 5 | python scripts/weight_diff.py make_diff --path_raw ${RAW_MODEL_PATH}/${model_size} --path_tuned /model --path_diff /results/${og_name}-diff 6 | python scripts/weight_diff.py recover --path_raw ${RAW_MODEL_PATH}/${model_size} --path_tuned test_recover --path_diff /results/${og_name}-diff --original_model /model -------------------------------------------------------------------------------- /data/processed/gsm_train/make_gsm.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('openai/gsm8k', 'main')['train'] 5 | 6 | fout = open('gsm8k.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'gsm8k' 13 | record['id'] = 'gsm8k_{}'.format(i) 14 | messages = [ 15 | {"role": "user", "content": elt['question']}, 16 | {"role": "assistant", "content": elt['answer']}, 17 | ] 18 | record['messages'] = messages 19 | fout.write(json.dumps(record)+'\n') 20 | -------------------------------------------------------------------------------- /data/processed/kaggle_food_recipes/make_kfr.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('Hieu-Pham/kaggle_food_recipes')['train'] 5 | 6 | fout = open('kfr.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'kfr' 13 | record['id'] = 'kfr_{}'.format(i) 14 | ingredients = ''.join([' - ' + x + '\n' for x in eval(elt['Cleaned_Ingredients'])]) 15 | recipe = ingredients + '\n\n' + elt['Instructions'] 16 | messages = [ 17 | {"role": "user", "content": 'Recipe for ' + elt['Title']}, 18 | {"role": "assistant", "content": recipe} 19 | ] 20 | record['messages'] = messages 21 | fout.write(json.dumps(record)+'\n') 22 | -------------------------------------------------------------------------------- /data/processed/lima/check_responses.py: -------------------------------------------------------------------------------- 1 | """Checks whether LIMA responses start by repeating the question.""" 2 | import json 3 | import os 4 | from openai import OpenAI 5 | from collections import Counter 6 | from tqdm import tqdm 7 | 8 | client = OpenAI() 9 | 10 | def get_prompt(instruction, answer): 11 | prompt = """Below is a pair of 
an instruction and a response. Your job is to tell if the response starts by rephrasing the instruction. 12 | If the response starts by rephrasing the instruction, e.g., "Give me a recipe for Tiramisu" -- "Sure; here's a recipe for tiramisu:" 13 | then output #### YES. Otherwise output #### NO. 14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=150 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | if os.path.exists('lima_checks.jsonl'): 35 | with open('lima_checks.jsonl') as fin: 36 | done_examples = [json.loads(x) for x in fin] 37 | else: 38 | done_examples = [] 39 | 40 | with open('lima_checks.jsonl', 'a') as fout: 41 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 42 | if i <= len(done_examples): 43 | continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | instruction = message['content'] 49 | answer = next_message['content'] 50 | message['check'] = fetch_response(get_prompt(instruction, answer)) 51 | fout.write(json.dumps(line) + '\n') 52 | -------------------------------------------------------------------------------- /data/processed/lima/randomize_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes an epoch-fixed randomized instruction dataset 3 | """ 4 | import json 5 | from collections import Counter 6 | import random 7 | 8 | 9 | c = Counter() 10 | 11 | random.seed(89) 12 | 13 | EPOCHS = 10 14 | 15 | 16 | # Gather all instructions 17 | instructions = [] 18 | for line in open('lima_data.jsonl'): 19 | line = json.loads(line) 20 | messages = line['messages'] 21 | for message in messages: 22 | if message['role'] == 'user': 23 | instructions.append(message['content']) 24 | 25 | with open('lima_random_instruction_{}_epoch.jsonl'.format(EPOCHS), 'w') as fout: 26 | # once per epoch so over 1 "epoch" in the training loop we see a unique shuffle 27 | for i in range(EPOCHS): 28 | random.shuffle(instructions) 29 | ins_iter = iter(instructions) 30 | for line in open('lima_data.jsonl'): 31 | line = json.loads(line) 32 | messages = line['messages'] 33 | for message in messages: 34 | if message['role'] == 'user': 35 | message['content'] = next(ins_iter) 36 | fout.write(json.dumps(line) + '\n') 37 | -------------------------------------------------------------------------------- /data/processed/lima/remove_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction.jsonl', 'w') as fout: 8 | for line in open('lima_data.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | 
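The LIMA processing scripts above (check_responses.py, randomize_instructions.py, remove_instructions.py, and the rephrasing scripts) all read and write the same chat-style JSONL schema: each record carries a 'messages' list of {"role", "content"} dicts, and the make_*.py scripts under data/processed additionally attach 'dataset' and 'id' fields. The validator below is an illustrative sketch only — it is not part of the repository — and assumes only that schema and the user/assistant roles seen in the scripts above; it can be pointed at any processed file (e.g. lima_no_instruction.jsonl) to confirm the records still parse after a transformation.

# validate_messages_jsonl.py -- illustrative sketch, not part of the repository.
# Checks that every record in a processed JSONL file has the chat schema the
# scripts above assume: a non-empty "messages" list of {"role", "content"}
# dicts whose roles are limited to "user" and "assistant".
import json
import sys

def validate(path):
    n_bad = 0
    for line_no, line in enumerate(open(path), start=1):
        record = json.loads(line)
        messages = record.get('messages', [])
        ok = bool(messages) and all(
            set(m) >= {'role', 'content'} and m['role'] in ('user', 'assistant')
            for m in messages
        )
        if not ok:
            n_bad += 1
            print('line {}: unexpected record shape'.format(line_no))
    print('{} malformed record(s) in {}'.format(n_bad, path))

if __name__ == '__main__':
    validate(sys.argv[1])  # e.g. python validate_messages_jsonl.py lima_no_instruction.jsonl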
-------------------------------------------------------------------------------- /data/processed/lima/remove_instructions_rephrased.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction_rephrased.jsonl', 'w') as fout: 8 | for line in open('lima_rephrased.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /data/processed/lima/rephrase_no_restate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. The response starts by rephrasing the instruction. 11 | Your job is to regenerate the response with the rephrasing of the instruction removed. 12 | E.g., for instruction "Give me a recipe for Tiramisu" a rephrasing is "Sure; here's a recipe for tiramisu:" 13 | You remove the rephrasing, and generate just the rest of the response. 14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=2000 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | #if os.path.exists('lima_checks.jsonl'): 35 | # with open('lima_checks.jsonl') as fin: 36 | # done_examples = [json.loads(x) for x in fin] 37 | #else: 38 | # done_examples = [] 39 | 40 | with open('lima_rephrased.jsonl', 'w') as fout: 41 | for i, line in tqdm(enumerate(open('lima_checks.jsonl'))): 42 | #if i <= len(done_examples): 43 | # continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | if message['check'].strip() == '#### YES': 49 | instruction = message['content'] 50 | answer = next_message['content'] 51 | next_message['content_old'] = answer 52 | next_message['content'] = fetch_response(get_prompt(instruction, answer)) 53 | fout.write(json.dumps(line) + '\n') 54 | -------------------------------------------------------------------------------- /data/processed/mbpp/make_mbpp.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('google-research-datasets/mbpp')['train'] 5 | 6 | fout = open('mbpp.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'mbpp' 13 | record['id'] = 'mbpp_{}'.format(i) 14 | messages = [ 15 | {"role": "user", "content": elt['text']}, 16 | {"role": "assistant", "content": elt['code']} 17 | ] 18 | record['messages'] = messages 19 | 
fout.write(json.dumps(record)+'\n') 20 | -------------------------------------------------------------------------------- /data/processed/pgn/make_pgn.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | from tqdm import tqdm 4 | 5 | dataset = datasets.load_dataset('patrickfrank1/chess-pgn-games')['train'] 6 | 7 | 8 | fout = open('pgn.jsonl', 'w') 9 | 10 | buf = [None, None, None] 11 | count = 0 12 | elements_written = 0 13 | for i, elt in tqdm(enumerate(dataset)): 14 | record = {} 15 | record['dataset'] = 'pgn-patrickfrank1' 16 | record['id'] = 'pgn-patrickfrank1-{}'.format(i) 17 | if 'WhiteElo' in elt['text']: 18 | buf[0] = elt['text'] 19 | count += 1 20 | elif 'BlackElo' in elt['text']: 21 | buf[1] = elt['text'] 22 | count += 1 23 | elif elt['text'].startswith('1. '): 24 | buf[2] = elt['text'] 25 | assert count == 2 26 | count = 0 27 | messages = [ 28 | {"role": "user", "content": buf[0] + '\n'+ buf[1] + '\n'}, 29 | {"role": "assistant", "content": buf[2]} 30 | ] 31 | record['messages'] = messages 32 | fout.write(json.dumps(record)+'\n') 33 | elements_written += 1 34 | if elements_written == 1000: 35 | break 36 | -------------------------------------------------------------------------------- /data/processed/poetry/make_poetry.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('merve/poetry')['train'] 5 | 6 | fout = open('poetry.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'merve_poetry' 13 | record['id'] = 'merve_poetry_{}'.format(i) 14 | if elt['poem name'] is None: 15 | continue 16 | messages = [ 17 | {"role": "user", "content": 'Write a poem called ' + elt['poem name']}, 18 | {"role": "assistant", "content": elt['content']} 19 | ] 20 | record['messages'] = messages 21 | fout.write(json.dumps(record)+'\n') 22 | -------------------------------------------------------------------------------- /ds_configs/stage3_no_offloading.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "overlap_comm": true, 26 | "contiguous_gradients": true, 27 | "sub_group_size": 1e9, 28 | "reduce_bucket_size": "auto", 29 | "stage3_prefetch_bucket_size": "auto", 30 | "stage3_param_persistence_threshold": "auto", 31 | "stage3_max_live_parameters": 1e9, 32 | "stage3_max_reuse_distance": 1e9, 33 | "stage3_gather_16bit_weights_on_model_save": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 1e5, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /ds_configs/stage3_no_offloading_accelerate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "zero_optimization": { 6 | 
"stage": 3, 7 | "overlap_comm": true, 8 | "contiguous_gradients": true, 9 | "sub_group_size": 1e9, 10 | "reduce_bucket_size": "auto", 11 | "stage3_prefetch_bucket_size": "auto", 12 | "stage3_param_persistence_threshold": "auto", 13 | "stage3_max_live_parameters": 1e9, 14 | "stage3_max_reuse_distance": 1e9, 15 | "stage3_gather_16bit_weights_on_model_save": true 16 | }, 17 | "gradient_accumulation_steps": "auto", 18 | "gradient_clipping": "auto", 19 | "steps_per_print": 1e5, 20 | "train_batch_size": "auto", 21 | "train_micro_batch_size_per_gpu": "auto", 22 | "wall_clock_breakdown": false 23 | } -------------------------------------------------------------------------------- /ds_configs/stage3_offloading.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "cpu", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "sub_group_size": 1e9, 36 | "reduce_bucket_size": "auto", 37 | "stage3_prefetch_bucket_size": "auto", 38 | "stage3_param_persistence_threshold": "auto", 39 | "stage3_max_live_parameters": 1e9, 40 | "stage3_max_reuse_distance": 1e9, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "steps_per_print": 1e5, 46 | "train_batch_size": "auto", 47 | "train_micro_batch_size_per_gpu": "auto", 48 | "wall_clock_breakdown": false 49 | } -------------------------------------------------------------------------------- /ds_configs/stage3_offloading_accelerate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "zero_optimization": { 6 | "stage": 3, 7 | "offload_optimizer": { 8 | "device": "cpu", 9 | "pin_memory": true 10 | }, 11 | "offload_param": { 12 | "device": "cpu", 13 | "pin_memory": true 14 | }, 15 | "overlap_comm": true, 16 | "contiguous_gradients": true, 17 | "sub_group_size": 1e9, 18 | "reduce_bucket_size": "auto", 19 | "stage3_prefetch_bucket_size": "auto", 20 | "stage3_param_persistence_threshold": "auto", 21 | "stage3_max_live_parameters": 1e9, 22 | "stage3_max_reuse_distance": 1e9, 23 | "stage3_gather_16bit_weights_on_model_save": true 24 | }, 25 | "gradient_accumulation_steps": "auto", 26 | "gradient_clipping": "auto", 27 | "steps_per_print": 1e5, 28 | "train_batch_size": "auto", 29 | "train_micro_batch_size_per_gpu": "auto", 30 | "wall_clock_breakdown": false 31 | } -------------------------------------------------------------------------------- /eval/codex_humaneval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz") 9 | 10 | 11 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]: 12 
| return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 13 | 14 | 15 | def stream_jsonl(filename: str) -> Iterable[Dict]: 16 | """ 17 | Parses each jsonl line and yields it as a dictionary 18 | """ 19 | if filename.endswith(".gz"): 20 | with open(filename, "rb") as gzfp: 21 | with gzip.open(gzfp, 'rt') as fp: 22 | for line in fp: 23 | if any(not x.isspace() for x in line): 24 | yield json.loads(line) 25 | else: 26 | with open(filename, "r") as fp: 27 | for line in fp: 28 | if any(not x.isspace() for x in line): 29 | yield json.loads(line) 30 | 31 | 32 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 33 | """ 34 | Writes an iterable of dictionaries to jsonl 35 | """ 36 | if append: 37 | mode = 'ab' 38 | else: 39 | mode = 'wb' 40 | filename = os.path.expanduser(filename) 41 | if filename.endswith(".gz"): 42 | with open(filename, mode) as fp: 43 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp: 44 | for x in data: 45 | gzfp.write((json.dumps(x) + "\n").encode('utf-8')) 46 | else: 47 | with open(filename, mode) as fp: 48 | for x in data: 49 | fp.write((json.dumps(x) + "\n").encode('utf-8')) -------------------------------------------------------------------------------- /eval/truthfulqa/configs.py: -------------------------------------------------------------------------------- 1 | # columns 2 | BEST_COL = 'Best Answer' 3 | ANSWER_COL = 'Correct Answers' 4 | INCORRECT_COL = 'Incorrect Answers' -------------------------------------------------------------------------------- /eval/val_eval/make_ref.py: -------------------------------------------------------------------------------- 1 | import json 2 | from openai import OpenAI 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | client = OpenAI() 7 | 8 | def fetch_response(prompt): 9 | # Replace 'your_api_key_here' with your actual OpenAI API key 10 | #openai.api_key = 'your_api_key_here' 11 | response = client.chat.completions.create( 12 | model="gpt-3.5-turbo", 13 | messages =[{'role': 'user', 'content': prompt}], 14 | max_tokens=1500 15 | ) 16 | #print(response.choices[0].message.content) 17 | return response.choices[0].message.content 18 | 19 | 20 | with open('val-gpt-3.5-turbo-ref.jsonl', 'w') as fout: 21 | for line in tqdm(open('val.jsonl')): 22 | line = json.loads(line) 23 | prompt = line['instruction'] 24 | response = fetch_response(prompt) 25 | example = {'messages': [{"role": "user", "content": prompt}, 26 | {"role": "assistant", "content": response}]} 27 | print(json.dumps(example)) 28 | fout.write(json.dumps(example) + '\n') 29 | -------------------------------------------------------------------------------- /human_eval/data/eval_annotations_tulu_1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/data/eval_annotations_tulu_1.xlsx -------------------------------------------------------------------------------- /human_eval/export_db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | 4 | 5 | if __name__ == "__main__": 6 | # database connection 7 | DATABASE = "data/evaluation.db" 8 | DB_CONN = sqlite3.connect(DATABASE, check_same_thread=False) 9 | DB_CURSOR = DB_CONN.cursor() 10 | 11 | # export the evaluation results as excel 12 | evaluation_results = pd.read_sql_query("SELECT * from evaluation_record", DB_CONN) 13 | 
evaluation_results.to_excel("data/eval_annotations.xlsx", index=False) 14 | 15 | -------------------------------------------------------------------------------- /human_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask-sqlalchemy 3 | flask-login -------------------------------------------------------------------------------- /human_eval/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/screenshot.png -------------------------------------------------------------------------------- /human_eval/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/static/favicon.png -------------------------------------------------------------------------------- /human_eval/static/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: Arial, Helvetica, sans-serif; 3 | } 4 | html { 5 | overflow-y:scroll; 6 | } 7 | xmp { 8 | font-family: Arial, Helvetica, sans-serif; 9 | } 10 | #nav { 11 | padding: 50px; 12 | border-radius: 5px; 13 | background-color: aliceblue; 14 | min-height: 100vh; 15 | } 16 | #history-message-region { 17 | padding: 20px; 18 | border-radius: 5px; 19 | margin: 10px 10px 10px 0; 20 | background: oldlace; 21 | height: 25vh; 22 | min-height: 150px; 23 | overflow: auto; 24 | resize: vertical; 25 | } 26 | #model-outputs-region { 27 | padding: 20px; 28 | border-radius: 5px; 29 | margin: 10px 10px 10px 0; 30 | background: #cecefa; 31 | } 32 | #evaluation-region { 33 | padding: 20px; 34 | border-radius: 5px; 35 | margin: 10px 10px 10px 0; 36 | background: lavenderblush; 37 | } 38 | .message { 39 | margin-bottom: 20px; 40 | } 41 | .icon-col { 42 | max-width: 70px; 43 | } 44 | .role-icon { 45 | border-radius: 50%; 46 | width: 50px; 47 | height: 50px; 48 | font-size: 20px; 49 | border: 1px solid #ddd; 50 | background-color: white; 51 | } 52 | .message-col { 53 | padding-top: 10px; 54 | } 55 | .message-text { 56 | font-size: 18px; 57 | margin: 0; 58 | word-wrap: break-word; 59 | white-space: pre-wrap; 60 | } 61 | /* .history-message-col { 62 | border: #ddd solid 2px; 63 | } */ 64 | .completion-icon { 65 | border-radius: 50%; 66 | width: 30px; 67 | height: 30px; 68 | font-size: 15px; 69 | border: 1px solid #ddd; 70 | background-color: #3e4cf1; 71 | color: white; 72 | } 73 | .completion-col { 74 | padding: 10px; 75 | margin: 15px; 76 | background-color: white; 77 | height: 50vh; 78 | overflow: auto; 79 | min-height: 200px; 80 | resize: vertical; 81 | } 82 | .eval-form-item { 83 | margin-bottom: 20px; 84 | } -------------------------------------------------------------------------------- /images/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/images/fig1.png -------------------------------------------------------------------------------- /images/tulu_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/images/tulu_logo.png 
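For quick offline checks, the Excel workbook produced by human_eval/export_db.py above can be read straight back with pandas (openpyxl, already listed in requirements.txt, handles the .xlsx format). The snippet below is an illustrative sketch only, not part of the repository; it assumes nothing about the column layout of the evaluation_record table beyond the output path hard-coded in export_db.py.

# inspect_annotations.py -- illustrative sketch, not part of the repository.
# Loads the export written by human_eval/export_db.py and prints a short
# summary so the annotations can be sanity-checked before computing metrics.
import pandas as pd

df = pd.read_excel('data/eval_annotations.xlsx')  # path used by export_db.py
print('{} annotation rows'.format(len(df)))
print('columns: {}'.format(list(df.columns)))
print(df.head())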
-------------------------------------------------------------------------------- /new_lima/check_responses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. Your job is to tell if the response starts by rephrasing the instruction. 11 | If the response starts by rephrasing the instruction, e.g., "Give me a recipe for Tiramisu" -- "Sure; here's a recipe for tiramisu:" 12 | then output #### YES. Otherwise output #### NO. 13 | 14 | Instruction: {} 15 | 16 | Response: {} 17 | """.format(instruction, answer) 18 | return prompt 19 | 20 | def fetch_response(prompt): 21 | # Replace 'your_api_key_here' with your actual OpenAI API key 22 | #openai.api_key = 'your_api_key_here' 23 | response = client.chat.completions.create( 24 | model="gpt-4-turbo", 25 | messages =[{'role': 'user', 'content': prompt}], 26 | max_tokens=150 27 | ) 28 | print(response.choices[0].message.content) 29 | return response.choices[0].message.content 30 | 31 | c = Counter() 32 | 33 | if os.path.exists('lima_checks.jsonl'): 34 | with open('lima_checks.jsonl') as fin: 35 | done_examples = [json.loads(x) for x in fin] 36 | else: 37 | done_examples = [] 38 | 39 | with open('lima_checks.jsonl', 'a') as fout: 40 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 41 | if i <= len(done_examples): 42 | continue 43 | line = json.loads(line) 44 | messages = line['messages'] 45 | for message, next_message in zip(messages, messages[1:]): 46 | if next_message['role'] == 'assistant': 47 | instruction = message['content'] 48 | answer = next_message['content'] 49 | message['check'] = fetch_response(get_prompt(instruction, answer)) 50 | fout.write(json.dumps(line) + '\n') 51 | -------------------------------------------------------------------------------- /new_lima/randomize_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes an epoch-fixed randomized instruction dataset 3 | """ 4 | import json 5 | from collections import Counter 6 | import random 7 | 8 | 9 | c = Counter() 10 | 11 | random.seed(89) 12 | 13 | EPOCHS = 10 14 | 15 | 16 | # Gather all instructions 17 | instructions = [] 18 | for line in open('lima_data.jsonl'): 19 | line = json.loads(line) 20 | messages = line['messages'] 21 | for message in messages: 22 | if message['role'] == 'user': 23 | instructions.append(message['content']) 24 | 25 | with open('lima_random_instruction_{}_epoch.jsonl'.format(EPOCHS), 'w') as fout: 26 | # once per epoch so over 1 "epoch" in the training loop we see a unique shuffle 27 | for i in range(EPOCHS): 28 | random.shuffle(instructions) 29 | ins_iter = iter(instructions) 30 | for line in open('lima_data.jsonl'): 31 | line = json.loads(line) 32 | messages = line['messages'] 33 | for message in messages: 34 | if message['role'] == 'user': 35 | message['content'] = next(ins_iter) 36 | fout.write(json.dumps(line) + '\n') 37 | -------------------------------------------------------------------------------- /new_lima/remove_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from 
collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction.jsonl', 'w') as fout: 8 | for line in open('lima_data.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_instructions_partial.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('partial_spec_no_instructions.jsonl', 'w') as fout: 8 | for line in open('partial_spec.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_instructions_rephrased.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction_rephrased.jsonl', 'w') as fout: 8 | for line in open('lima_rephrased.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_responses.py: -------------------------------------------------------------------------------- 1 | import json 2 | from openai import OpenAI 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | client = OpenAI() 7 | 8 | def get_prompt(doc): 9 | prompt = """Please generate a very concise instruction (at most 20 words) for a chatbot such that the _answer_ to that instruction is the paragraph below. 10 | 11 | Paragraph: 12 | 13 | -------------------------------------------------------------- 14 | {} 15 | -------------------------------------------------------------- 16 | 17 | Now generate an instruction that would generate that paragraph. 
18 | """.format(doc) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=150 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | pretrain_examples = iter([json.loads(x) for x in open('pretrain.jsonl')]) 35 | 36 | with open('lima_no_responses.jsonl') as fin: 37 | done_examples = [json.loads(x) for x in fin] 38 | 39 | with open('lima_no_responses.jsonl', 'a') as fout: 40 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 41 | if i <= len(done_examples): 42 | continue 43 | line = json.loads(line) 44 | messages = line['messages'] 45 | for message, next_message in zip(messages, messages[1:]): 46 | if next_message['role'] == 'assistant': 47 | answer = next(pretrain_examples) 48 | next_message['content'] = answer 49 | message['content'] = fetch_response(get_prompt(answer)) 50 | fout.write(json.dumps(line) + '\n') 51 | -------------------------------------------------------------------------------- /new_lima/rephrase_no_restate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. The response starts by rephrasing the instruction. 11 | Your job is to regenerate the response with the rephrasing of the instruction removed. 12 | E.g., for instruction "Give me a recipe for Tiramisu" a rephrasing is "Sure; here's a recipe for tiramisu:" 13 | You remove the rephrasing, and generate just the rest of the response. 
14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=2000 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | #if os.path.exists('lima_checks.jsonl'): 35 | # with open('lima_checks.jsonl') as fin: 36 | # done_examples = [json.loads(x) for x in fin] 37 | #else: 38 | # done_examples = [] 39 | 40 | with open('lima_rephrased.jsonl', 'w') as fout: 41 | for i, line in tqdm(enumerate(open('lima_checks.jsonl'))): 42 | #if i <= len(done_examples): 43 | # continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | if message['check'].strip() == '#### YES': 49 | instruction = message['content'] 50 | answer = next_message['content'] 51 | next_message['content_old'] = answer 52 | next_message['content'] = fetch_response(get_prompt(instruction, answer)) 53 | fout.write(json.dumps(line) + '\n') 54 | -------------------------------------------------------------------------------- /open_instruct/gradio_demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import torch 3 | import sys 4 | from transformers import AutoTokenizer, AutoModelForCausalLM 5 | 6 | if len(sys.argv) > 1: 7 | model_name_or_path = sys.argv[1] 8 | else: 9 | raise ValueError("Please provide a model name or path as the first argument") 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 12 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path) 13 | 14 | model.half().cuda() 15 | 16 | def instruct(instruction): 17 | with torch.inference_mode(): 18 | input_text = instruction 19 | input_ids = tokenizer.encode(input_text, return_tensors='pt').cuda() 20 | output_ids = model.generate(input_ids, max_length=1024)[0] 21 | output_str = tokenizer.decode(output_ids[input_ids.shape[-1]:]) 22 | return output_str.strip() 23 | 24 | demo = gr.Interface( 25 | fn=instruct, 26 | inputs=gr.Textbox(lines=10, placeholder="Enter your instruction here..."), 27 | outputs="text", 28 | title="Demo for Open-Instruct", 29 | description="Model name or path: " + model_name_or_path 30 | ) 31 | 32 | demo.launch(share=True, server_port=7860) -------------------------------------------------------------------------------- /open_instruct/utils.py: -------------------------------------------------------------------------------- 1 | #USER_TAG = '<|person1|>' 2 | #ASSISTANT_TAG = '<|person2|>' 3 | USER_TAG = '<|user|>' 4 | ASSISTANT_TAG = '<|assistant|>' 5 | #USER_TAG = '<|A|>' 6 | #ASSISTANT_TAG = '<|B|>' 7 | -------------------------------------------------------------------------------- /plot_ratios.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | 6 | sft = [json.loads(x) for x in open(sys.argv[1])] 7 | 8 | 9 | import transformers 10 | #tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T') 11 | 12 | def get_ratios(ds): 13 | data = {} 14 | max_index = 
max([elt[0]['pair'][0][0] for elt in ds]) 15 | #print('Max index:', max_index) 16 | for ins_index in range(max_index+1): 17 | other_elts = [] 18 | for elt in ds: 19 | if elt[0]['pair'] == [[ins_index, ins_index]]: 20 | real_elt = elt 21 | elif elt[0]['pair'][0][0] == ins_index: 22 | other_elts.append(elt) 23 | data[ins_index] = {'real_elt': real_elt, 'other_elts': other_elts} 24 | 25 | ## Print a few 26 | #for index in range(5): 27 | # #real_string = tokenizer.decode(data[index]['real_elt'][0]['input_ids'][0]) 28 | # real_prob = data[index]['real_elt'][1] 29 | # #print('--Real: {}--'.format(real_prob)) 30 | # #print(real_string) 31 | # #print('----------') 32 | 33 | # #print(len(data[index]['other_elts'])) 34 | # #print(len(set(data[index]['other_elts']))) 35 | # for fake in sorted(data[index]['other_elts'], key=lambda x: -x[1]): 36 | # fake_string = tokenizer.decode(fake[0]['input_ids'][0]) 37 | # fake_prob = fake[1] 38 | # # print('--Fake: {}--'.format(fake_prob)) 39 | # # print(fake_string) 40 | # #print('-------------') 41 | # #print('-------------') 42 | # #print('-------------') 43 | 44 | ratios = [] 45 | for ins_index in data: 46 | real_likelihood = data[ins_index]['real_elt'][1] 47 | fake_likelihoods = [x[1] for x in data[ins_index]['other_elts']] 48 | 49 | likelihood_ratios = [real_likelihood-x for x in fake_likelihoods] 50 | ratios.extend(likelihood_ratios) 51 | return ratios 52 | 53 | sft_ratios = get_ratios(sft) 54 | 55 | x_values = sft_ratios 56 | print('Percent real scored higher than random', '{}'.format(sys.argv[1]), sum([x>0 for x in sft_ratios])/len(sft_ratios)) 57 | -------------------------------------------------------------------------------- /print_one_example.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import sys 4 | i = int(sys.argv[1]) 5 | paths = sys.argv[2:] 6 | 7 | for path in paths: 8 | data = open(path).read() 9 | try: 10 | #data = json.load(open(path)) 11 | data = json.loads(data) 12 | print(data[i]['instruction']) 13 | print('|||') 14 | print(data[i]['output']) 15 | except Exception: 16 | #print(data.split('\n')[0]) 17 | data = [json.loads(x) for x in data.strip().split('\n')] 18 | print(data[i]['messages'][0]['content']) 19 | print('|||') 20 | print(data[i]['messages'][1]['content']) 21 | -------------------------------------------------------------------------------- /quantize/README.md: -------------------------------------------------------------------------------- 1 | # Compression 2 | 3 | Model compression using GPTQ. We're going to rely on the AutoGPTQ code base: https://github.com/PanQiWei/AutoGPTQ. 4 | -------------------------------------------------------------------------------- /quantize/experiments/gptq_compress_llama_7b.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kick off job to compress a smaller model so that we don't have to debug the huge one. 
3 | """ 4 | 5 | import beaker 6 | from beaker import Beaker, ExperimentSpec, TaskSpec 7 | 8 | beaker_client = Beaker.from_env(default_workspace="ai2/davidw") 9 | 10 | wkdir = "$NFS_HOME/proj/open-instruct/quantize" 11 | python_cmd = ( 12 | "python quantize_autogptq_wikitext.py " 13 | "--pretrained_model_dir /net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B " 14 | "--quantized_model_dir /net/nfs.cirrascale/allennlp/davidw/checkpoints/gptq_llama_7b" 15 | ) 16 | 17 | spec = ExperimentSpec( 18 | description="GPTQ quantization.", 19 | tasks=[ 20 | TaskSpec( 21 | name="autogptq_llama_7b", 22 | image=beaker.ImageSource(beaker="01GZHG16S90N033XP4D6BPC8NR"), 23 | command=["bash", "-c", f"cd {wkdir}; {python_cmd}"], 24 | result=beaker.ResultSpec( 25 | path="/unused" # required even if the task produces no output. 26 | ), 27 | datasets=[ 28 | beaker.DataMount( 29 | source=beaker.DataSource(host_path="/net/nfs.cirrascale"), 30 | mount_path="/net/nfs.cirrascale", 31 | ) 32 | ], 33 | context=beaker.TaskContext(priority=beaker.Priority("high")), 34 | constraints=beaker.Constraints( 35 | cluster=["ai2/s2-cirrascale", "ai2/allennlp-cirrascale"] 36 | ), 37 | env_vars=[ 38 | beaker.EnvVar( 39 | name="NFS_HOME", value="/net/nfs.cirrascale/allennlp/davidw" 40 | ), 41 | beaker.EnvVar( 42 | name="HF_HOME", 43 | value="/net/nfs.cirrascale/allennlp/davidw/cache/huggingface" 44 | ), 45 | ], 46 | resources=beaker.TaskResources(gpu_count=1), 47 | ), 48 | ], 49 | ) 50 | 51 | experiment_name = "quantize" 52 | workspace_name = "ai2/davidw" 53 | 54 | experiment = beaker_client.experiment.create( 55 | experiment_name, 56 | spec, 57 | workspace=workspace_name, 58 | ) 59 | -------------------------------------------------------------------------------- /quantize/scripts/eval_on_mmlu.sh: -------------------------------------------------------------------------------- 1 | # export CUDA_VISIBLE_DEVICES=0 2 | 3 | python -m eval.mmlu_eval.evaluate_hf_lm \ 4 | --ntrain 0 \ 5 | --data_dir data/mmlu \ 6 | --save_dir results/mmlu/alpaca-65B-gptq-0shot/ \ 7 | --model "/net/nfs.cirrascale/allennlp/davidw/checkpoints/gptq_alpaca_fixed_65b" \ 8 | --tokenizer "/net/nfs.cirrascale/allennlp/hamishi/open-instruct/alpaca_fixed_65b" \ 9 | --eval_batch_size 8 \ 10 | --gptq -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | scipy 3 | packaging 4 | sentencepiece 5 | datasets 6 | deepspeed 7 | accelerate 8 | peft>=0.4.0 9 | bitsandbytes>=0.41.1 10 | evaluate>=0.4.0 11 | tokenizers>=0.13.3 12 | protobuf 13 | openai>=1.0.0 14 | tiktoken 15 | rouge_score 16 | tensorboard 17 | wandb 18 | gradio 19 | termcolor 20 | jsonlines 21 | unidic-lite 22 | einops 23 | flash-attn 24 | auto-gptq 25 | fire 26 | alpaca-eval 27 | # for human eval web app 28 | flask 29 | vllm 30 | openpyxl 31 | # for ifeval 32 | nltk 33 | langdetect 34 | immutabledict 35 | -------------------------------------------------------------------------------- /scripts/convert_llama_weights_to_hf.sh: -------------------------------------------------------------------------------- 1 | LLAMA_FOLDER=/net/nfs.cirrascale/allennlp/jacobm/llama/llama/models 2 | 3 | for MODEL_SIZE in 7B 13B 30B 65B; do 4 | echo "Converting Llama ${MODEL_SIZE} to HuggingFace format" 5 | python -m transformers.models.llama.convert_llama_weights_to_hf \ 6 | --input_dir $LLAMA_FOLDER/ \ 7 | --model_size $MODEL_SIZE \ 8 | --output_dir 
/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/${MODEL_SIZE} 9 | done -------------------------------------------------------------------------------- /scripts/do_preference_scripts.sh: -------------------------------------------------------------------------------- 1 | for model in output/limabaseline7Bep15_seed1/ output/olmolima3e-6baseline7Bep15_seed1/ meta-llama/Llama-2-7B-hf allenai/OLMo-7B-hf; do python open_instruct/ratio_eval.py --model_name_or_path ${model} --tokenizer_name output/limabaseline7Bep15_seed1/ --train_file data/processed/stanford_alpaca/stanford_alpaca_data.jsonl --max_seq_length 1024 --per_device_train_batch_size 1 --max_examples 1000 --output_path `echo $model | sed 's|/|-|g'`.jsonl; done 2 | for model in output/limabaseline7Bep15_seed1/ output/olmolima3e-6baseline7Bep15_seed1/ meta-llama/Llama-2-7B-hf allenai/OLMo-7B-hf; do python plot_ratios.py $model; done 3 | -------------------------------------------------------------------------------- /scripts/dpo_train_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | # you need 8 GPUs for full finetuning 2 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 3 | 4 | NUM_GPUS=8 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=32 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training model using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | accelerate launch \ 11 | --mixed_precision bf16 \ 12 | --num_machines 1 \ 13 | --num_processes $NUM_GPUS \ 14 | --use_deepspeed \ 15 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 16 | open_instruct/dpo_tune.py \ 17 | --model_name_or_path allenai/tulu-2-7b \ 18 | --use_flash_attn \ 19 | --gradient_checkpointing \ 20 | --tokenizer_name allenai/tulu-2-7b \ 21 | --use_slow_tokenizer \ 22 | --dataset_name HuggingFaceH4/ultrafeedback_binarized \ 23 | --max_seq_length 2048 \ 24 | --preprocessing_num_workers 16 \ 25 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 26 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 27 | --learning_rate 5e-7 \ 28 | --lr_scheduler_type linear \ 29 | --warmup_ratio 0.1 \ 30 | --weight_decay 0.
\ 31 | --num_train_epochs 3 \ 32 | --output_dir ~/dpo_7b_recreate2 \ 33 | --with_tracking \ 34 | --report_to tensorboard \ 35 | --logging_steps 1 -------------------------------------------------------------------------------- /scripts/dpo_train_with_qlora.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 2 | 3 | NUM_GPUS=8 4 | BATCH_SIZE_PER_GPU=1 5 | TOTAL_BATCH_SIZE=128 6 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 7 | echo "Training model using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 8 | 9 | # Lora training 10 | accelerate launch \ 11 | --num_machines 1 \ 12 | --num_processes $NUM_GPUS \ 13 | open_instruct/dpo_tune.py \ 14 | --model_name_or_path allenai/tulu-2-7b \ 15 | --use_qlora \ 16 | --use_lora \ 17 | --use_flash_attn \ 18 | --lora_rank 64 \ 19 | --lora_alpha 16 \ 20 | --lora_dropout 0.1 \ 21 | --tokenizer_name allenai/tulu-2-7b \ 22 | --use_slow_tokenizer \ 23 | --dataset_name HuggingFaceH4/ultrafeedback_binarized \ 24 | --max_seq_length 1024 \ 25 | --preprocessing_num_workers 128 \ 26 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 27 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 28 | --learning_rate 1e-4 \ 29 | --lr_scheduler_type linear \ 30 | --warmup_ratio 0.03 \ 31 | --weight_decay 0. \ 32 | --num_train_epochs 5 \ 33 | --output_dir output/tulu_v2_dpo_qlora/ \ 34 | --with_tracking \ 35 | --report_to tensorboard \ 36 | --logging_steps 1 && 37 | 38 | python open_instruct/merge_lora.py \ 39 | --base_model_name_or_path allenai/tulu-2-7b \ 40 | --lora_model_name_or_path output/tulu_v2_dpo_qlora/ \ 41 | --output_dir output/tulu_v2_dpo_qlora_merged/ \ 42 | --qlora \ 43 | --save_tokenizer 44 | -------------------------------------------------------------------------------- /scripts/dummy_length_scorer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dummy evaluator that uses a given metric to determine winners in pairwise comparisons. Used to further investigate correlations. 
3 | ''' 4 | import argparse 5 | from transformers import AutoTokenizer 6 | from datasets import load_dataset 7 | import json 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--candidate_file", type=str, help="Candidate file for candidate model outputs.") 11 | parser.add_argument("--metric", default="unique", type=str, help="Metric to use for comparison.") 12 | parser.add_argument("--tokenizer", default="/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B", type=str, help="Tokenizer to use for tokenization.") 13 | args = parser.parse_args() 14 | 15 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=False) 16 | 17 | def count_unique_tokens(text): 18 | return len(set(tokenizer(text).input_ids)) 19 | 20 | def count_token_length(text): 21 | return len(tokenizer(text).input_ids) 22 | 23 | metric_map = { 24 | "unique": count_unique_tokens, 25 | "length": count_token_length, 26 | } 27 | 28 | if __name__ == "__main__": 29 | # load reference data 30 | reference_dataset = load_dataset("hamishivi/alpaca-farm-davinci-003-2048-token") 31 | reference_dataset = [x["output"] for x in reference_dataset["train"]] 32 | # load candidate data 33 | with open(args.candidate_file, "r") as f: 34 | candidate_dataset = json.load(f) 35 | candidate_dataset = [x["output"] for x in candidate_dataset] 36 | win_counter = 0 37 | lose_counter = 0 38 | tie_counter = 0 39 | # compute metrics - we assume same order of reference and candidate data 40 | for reference_sample, candidate_sample in zip(reference_dataset, candidate_dataset): 41 | reference_metric = metric_map[args.metric](reference_sample) 42 | candidate_metric = metric_map[args.metric](candidate_sample) 43 | if reference_metric > candidate_metric: 44 | lose_counter += 1 45 | elif reference_metric < candidate_metric: 46 | win_counter += 1 47 | else: 48 | tie_counter += 1 49 | 50 | print(f"{win_counter}\t{lose_counter}\t{tie_counter}") 51 | -------------------------------------------------------------------------------- /scripts/eval/alpaca_farm.sh: -------------------------------------------------------------------------------- 1 | # Please make sure OPENAI_API_KEY is set in your environment variables 2 | 3 | # Use V1 of alpaca farm evaluation. 4 | export IS_ALPACA_EVAL_2=False 5 | 6 | # use vllm for generation 7 | python -m eval.alpaca_farm.run_eval \ 8 | --model_name_or_path ../checkpoints/tulu_v1_7B/ \ 9 | --tokenizer_name_or_path ../checkpoints/tulu_v1_7B/ \ 10 | --save_dir results/alpaca_farm/tulu_v1_7B/ \ 11 | --eval_batch_size 20 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | 17 | # use normal huggingface generation function 18 | python -m eval.alpaca_farm.run_eval \ 19 | --model_name_or_path ../checkpoints/tulu_v1_7B/ \ 20 | --tokenizer_name_or_path ../checkpoints/tulu_v1_7B/ \ 21 | --save_dir results/alpaca_farm/tulu_v1_7B/ \ 22 | --eval_batch_size 20 \ 23 | --use_chat_format \ 24 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 25 | --load_in_8bit 26 | -------------------------------------------------------------------------------- /scripts/eval/alpaca_farm2.sh: -------------------------------------------------------------------------------- 1 | # Please make sure OPENAI_API_KEY is set in your environment variables 2 | 3 | # Use V1 of alpaca farm evaluation. 
4 | export IS_ALPACA_EVAL_2=False 5 | 6 | # Evaluating LIMA baseline 7b model using vLLM and chat format 7 | python -m eval.alpaca_farm.run_eval \ 8 | --model_name_or_path output//lima_baseline_7B20e/ \ 9 | --tokenizer_name_or_path output//lima_baseline_7B20e/ \ 10 | --save_dir results/alpaca_farm/lima_baseline_7B20e/ \ 11 | --eval_batch_size 20 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # Evaluating LIMA noins 7b model using vLLM and chat format 17 | python -m eval.alpaca_farm.run_eval \ 18 | --model_name_or_path output//lima_noins_7B20e/ \ 19 | --tokenizer_name_or_path output//lima_noins_7B20e/ \ 20 | --save_dir results/alpaca_farm/lima_noins_7B20e/ \ 21 | --eval_batch_size 20 \ 22 | --use_vllm \ 23 | --use_chat_format \ 24 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 25 | -------------------------------------------------------------------------------- /scripts/eval/bbh.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # evaluating baseline 7B model using chain-of-thought and chat format 6 | python -m eval.bbh.run_eval \ 7 | --data_dir data/eval/bbh \ 8 | --save_dir results/bbh/lima_baseline_7B/ \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --max_num_examples_per_task 40 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # evaluating noins 7B model using chain-of-thought and chat format 17 | python -m eval.bbh.run_eval \ 18 | --data_dir data/eval/bbh \ 19 | --save_dir results/bbh/lima_noins_7B/ \ 20 | --model output//lima_noins_7B/ \ 21 | --tokenizer output//lima_noins_7B/ \ 22 | --max_num_examples_per_task 40 \ 23 | --use_vllm \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/eval/bbh2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation.
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # evaluating baseline 7B model using chain-of-thought and chat format 6 | python -m eval.bbh.run_eval \ 7 | --data_dir data/eval/bbh \ 8 | --save_dir results/bbh/lima_baseline_7B/ \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --max_num_examples_per_task 40 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # evaluating noins 7B model using chain-of-thought and chat format 17 | python -m eval.bbh.run_eval \ 18 | --data_dir data/eval/bbh \ 19 | --save_dir results/bbh/lima_noins_7B/ \ 20 | --model output//lima_noins_7B/ \ 21 | --tokenizer output//lima_noins_7B/ \ 22 | --max_num_examples_per_task 40 \ 23 | --use_vllm \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/eval/gsm2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating LIMA baseline 7b model using chain-of-thought and chat format 5 | python -m eval.gsm.run_eval \ 6 | --data_dir data/eval/gsm/ \ 7 | --max_num_examples 200 \ 8 | --save_dir results/gsm/lima_baseline_7B-cot-8shot \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --n_shot 8 \ 12 | --use_chat_format \ 13 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 14 | --use_vllm 15 | 16 | # Evaluating LIMA noins 7b model using chain-of-thought and chat format 17 | python -m eval.gsm.run_eval \ 18 | --data_dir data/eval/gsm/ \ 19 | --max_num_examples 200 \ 20 | --save_dir results/gsm/lima_noins_7B-cot-8shot \ 21 | --model output//lima_noins_7B/ \ 22 | --tokenizer output//lima_noins_7B/ \ 23 | --n_shot 8 \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 26 | --use_vllm 27 | 28 | 29 | ## Evaluating llama2 chat model using chain-of-thought and chat format 30 | #python -m eval.gsm.run_eval \ 31 | # --data_dir data/eval/gsm/ \ 32 | # --max_num_examples 200 \ 33 | # --save_dir results/gsm/llama2-chat-7B-cot-8shot \ 34 | # --model ../hf_llama2_models/7B-chat \ 35 | # --tokenizer ../hf_llama2_models/7B-chat \ 36 | # --n_shot 8 \ 37 | # --use_chat_format \ 38 | # --chat_formatting_function eval.templates.create_prompt_with_llama2_chat_format \ 39 | # --use_vllm 40 | -------------------------------------------------------------------------------- /scripts/eval/ifeval.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating baseline 7B model using chat format 5 | python -m eval.ifeval.run_eval \ 6 | --data_dir data/eval/ifeval/ \ 7 | --save_dir results/ifeval/lima_baseline_7B \ 8 | --model output//lima_baseline_7B/ \ 9 | --tokenizer output//lima_baseline_7B/ \ 10 | --use_chat_format \ 11 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 12 | --use_vllm 13 | 14 | # Evaluating noins 7B model using chat format 15 | python -m eval.ifeval.run_eval \ 16 | --data_dir data/eval/ifeval/ \ 17 | --save_dir results/ifeval/lima_noins_7B \ 18 | --model output//lima_noins_7B/ \ 19 | --tokenizer output//lima_noins_7B/ \ 20 | --use_chat_format \ 21 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 22 | --use_vllm 23 | 24 | -------------------------------------------------------------------------------- /scripts/eval/ifeval2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating baseline 7B model using chat format 5 | python -m eval.ifeval.run_eval \ 6 | --data_dir data/eval/ifeval/ \ 7 | --save_dir results/ifeval/lima_baseline_7B \ 8 | --model output//lima_baseline_7B/ \ 9 | --tokenizer output//lima_baseline_7B/ \ 10 | --use_chat_format \ 11 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 12 | --use_vllm 13 | 14 | # Evaluating noins 7B model using chat format 15 | python -m eval.ifeval.run_eval \ 16 | --data_dir data/eval/ifeval/ \ 17 | --save_dir results/ifeval/lima_noins_7B \ 18 | --model output//lima_noins_7B/ \ 19 | --tokenizer output//lima_noins_7B/ \ 20 | --use_chat_format \ 21 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 22 | --use_vllm 23 | 24 | -------------------------------------------------------------------------------- /scripts/eval/mmlu2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # Evaluating LIMA baseline 7B model using 0 shot and chat format 6 | python -m eval.mmlu.run_eval \ 7 | --ntrain 0 \ 8 | --data_dir data/eval/mmlu \ 9 | --save_dir results/mmlu/lima_baseline_7B-cot-0shot \ 10 | --model output//lima_baseline_7B/ \ 11 | --tokenizer output//lima_baseline_7B/ \ 12 | --eval_batch_size 4 \ 13 | --load_in_8bit \ 14 | --use_chat_format \ 15 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 16 | 17 | # Evaluating LIMA baseline 7B model using 5 shot and chat format 18 | python -m eval.mmlu.run_eval \ 19 | --ntrain 5 \ 20 | --data_dir data/eval/mmlu \ 21 | --save_dir results/mmlu/lima_baseline_7B-cot-5shot \ 22 | --model output//lima_baseline_7B/ \ 23 | --tokenizer output//lima_baseline_7B/ \ 24 | --eval_batch_size 4 \ 25 | --load_in_8bit \ 26 | --use_chat_format \ 27 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 28 | 29 | 30 | 31 | # Evaluating LIMA noins 7B model using 0 shot and chat format 32 | python -m eval.mmlu.run_eval \ 33 | --ntrain 0 \ 34 | --data_dir data/eval/mmlu \ 35 | --save_dir results/mmlu/lima_noins_7B-cot-0shot \ 36 | --model output//lima_noins_7B/ \ 37 | --tokenizer output//lima_noins_7B/ \ 38 | --eval_batch_size 4 \ 39 | --load_in_8bit \ 40 | --use_chat_format \ 41 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 42 | 43 | # Evaluating LIMA noins 7B model using 5 shot and chat format 44 | python -m eval.mmlu.run_eval \ 45 | --ntrain 5 \ 46 | --data_dir data/eval/mmlu \ 47 | --save_dir results/mmlu/lima_noins_7B-cot-5shot \ 48 | --model output//lima_noins_7B/ \ 49 | --tokenizer output//lima_noins_7B/ \ 50 | --eval_batch_size 4 \ 51 | --load_in_8bit \ 52 | --use_chat_format \ 53 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 54 | -------------------------------------------------------------------------------- /scripts/eval/toxigen.sh: -------------------------------------------------------------------------------- 1 | # example scripts for toxigen 2 | 3 | # evaluate an open-instruct model with chat format 4 | python -m eval.toxigen.run_eval \ 5 | --data_dir data/eval/toxigen/ \ 6 | --save_dir tulu_65b \ 7 | --model_name_or_path tulu_65b/ \ 8 | --use_vllm \ 9 | --use_chat_format \ 10 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 11 | 12 | 13 | # evaluate a base model without chat format 14 | python -m eval.toxigen.run_eval \ 15 | --data_dir data/eval/toxigen/ \ 16 | --save_dir tulu_65b \ 17 | --model_name_or_path tulu_65b/ \ 18 | --use_vllm 19 | 20 | 21 | # evaluate chatGPT 22 | python -m eval.toxigen.run_eval \ 23 | --data_dir data/eval/toxigen/ \ 24 | --save_dir results/toxigen/chatgpt \ 25 | --openai_engine gpt-3.5-turbo-0301 \ 26 | --max_prompts_per_group 100 \ 27 | --eval_batch_size 20 28 | 29 | 30 | # evaluate gpt4 31 | python -m eval.toxigen.run_eval \ 32 | --data_dir data/eval/toxigen/ \ 33 | --save_dir results/toxigen/gpt4 \ 34 | --openai_engine gpt-4-0314 \ 35 | --max_prompts_per_group 100 \ 36 | --eval_batch_size 20 -------------------------------------------------------------------------------- /scripts/eval/xstest.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # Evaluating tulu 7B model using chat format 6 | python -m eval.xstest.run_eval \ 7 | --data_dir data/eval/xstest/ \ 8 | --save_dir results/xstest/tulu-7B-sft \ 9 | --model ../checkpoints/tulu2/7B-sft \ 10 | --tokenizer ../checkpoints/tulu2/7B-sft \ 11 | --use_chat_format \ 12 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 13 | --use_vllm 14 | 15 | 16 | # Evaluating tulu 70B dpo model using chat format 17 | python -m eval.xstest.run_eval \ 18 | --data_dir data/eval/xstest/ \ 19 | --save_dir results/xstest/tulu-70B-dpo \ 20 | --model allenai/tulu-2-dpo-70b \ 21 | --tokenizer allenai/tulu-2-dpo-70b \ 22 | --use_chat_format \ 23 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 24 | --use_vllm 25 | 26 | 27 | # Evaluating chatgpt 28 | python -m eval.xstest.run_eval \ 29 | --data_dir data/eval/xstest/ \ 30 | --save_dir results/xstest/chatgpt-no-cot \ 31 | --openai_engine "gpt-3.5-turbo-0125" \ 32 | --eval_batch_size 20 33 | 34 | 35 | # Evaluating gpt4 36 | python -m eval.xstest.run_eval \ 37 | --data_dir data/eval/xstest/ \ 38 | --save_dir results/xstest/gpt4-cot \ 39 | --openai_engine "gpt-4-0613" \ 40 | --eval_batch_size 20 -------------------------------------------------------------------------------- /scripts/finetune_lora_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # Lora training 11 | accelerate launch \ 12 | --mixed_precision bf16 \ 13 | --num_machines 1 \ 14 | --num_processes $NUM_GPUS \ 15 | --use_deepspeed \ 16 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 17 | open_instruct/finetune.py \ 18 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 19 | --use_flash_attn \ 20 | --use_lora \ 21 | --lora_rank 64 \ 22 | --lora_alpha 16 \ 23 | --lora_dropout 0.1 \ 24 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 25 | --use_slow_tokenizer \ 26 | --train_file oasst1_data.jsonl \ 27 | --max_seq_length 4096 \ 28 | --preprocessing_num_workers 16 \ 29 | --checkpointing_steps epoch \ 30 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 31 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 32 | --learning_rate 1e-4 \ 33 | --lr_scheduler_type linear \ 34 | --warmup_ratio 0.03 \ 35 | --weight_decay 0. 
\ 36 | --num_train_epochs 5 \ 37 | --output_dir output/tulu_v2_${MODEL_SIZE}_lora/ \ 38 | --with_tracking \ 39 | --report_to tensorboard \ 40 | --logging_steps 1 && 41 | 42 | python open_instruct/merge_lora.py \ 43 | --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 44 | --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_lora/ \ 45 | --output_dir output/tulu_v2_${MODEL_SIZE}_lora_merged/ \ 46 | --save_tokenizer 47 | -------------------------------------------------------------------------------- /scripts/finetune_qlora_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 2 | 3 | MODEL_SIZE=70B 4 | NUM_GPUS=8 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # Lora training 11 | accelerate launch \ 12 | --num_machines 1 \ 13 | --num_processes $NUM_GPUS \ 14 | open_instruct/finetune.py \ 15 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 16 | --gradient_checkpointing \ 17 | --use_qlora \ 18 | --use_lora \ 19 | --use_flash_attn \ 20 | --lora_rank 64 \ 21 | --lora_alpha 16 \ 22 | --lora_dropout 0.1 \ 23 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 26 | --max_seq_length 4096 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-4 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 5 \ 35 | --output_dir output/tulu_v2_${MODEL_SIZE}_qlora/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 && 39 | 40 | python open_instruct/merge_lora.py \ 41 | --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 42 | --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_qlora/ \ 43 | --output_dir output/tulu_v2_${MODEL_SIZE}_qlora_merged/ \ 44 | --qlora \ 45 | --save_tokenizer 46 | -------------------------------------------------------------------------------- /scripts/finetune_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 11 | # but it will trade off speed. 
12 | accelerate launch \ 13 | --mixed_precision bf16 \ 14 | --num_machines 1 \ 15 | --num_processes $NUM_GPUS \ 16 | --use_deepspeed \ 17 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 18 | open_instruct/finetune.py \ 19 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 20 | --use_flash_attn \ 21 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 22 | --use_slow_tokenizer \ 23 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 24 | --max_seq_length 8192 \ 25 | --preprocessing_num_workers 128 \ 26 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 27 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 28 | --learning_rate 2e-5 \ 29 | --lr_scheduler_type linear \ 30 | --warmup_ratio 0.03 \ 31 | --weight_decay 0. \ 32 | --num_train_epochs 2 \ 33 | --output_dir output/tulu_v2_${MODEL_SIZE}/ \ 34 | --with_tracking \ 35 | --report_to tensorboard \ 36 | --logging_steps 1 -------------------------------------------------------------------------------- /scripts/finetune_with_hf_trainer.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | deepspeed --include localhost:0,1,2,3 open_instruct/finetune_trainer.py \ 11 | --deepspeed ds_configs/stage3_no_offloading.conf \ 12 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 13 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 14 | --use_flash_attn True \ 15 | --use_fast_tokenizer False \ 16 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 17 | --max_seq_length 8192 \ 18 | --preprocessing_num_workers 64 \ 19 | --do_train \ 20 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 21 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 22 | --learning_rate 2e-5 \ 23 | --lr_scheduler_type linear \ 24 | --warmup_ratio 0.03 \ 25 | --weight_decay 0. \ 26 | --evaluation_strategy "no" \ 27 | --logging_steps 1 \ 28 | --save_strategy epoch \ 29 | --save_total_limit 1 \ 30 | --num_train_epochs 2 \ 31 | --output_dir output/tulu_v2_${MODEL_SIZE}/ \ 32 | --bf16 \ 33 | --tf32 True \ 34 | --torch_dtype bfloat16 \ 35 | --overwrite_output_dir \ 36 | --report_to "tensorboard" -------------------------------------------------------------------------------- /scripts/get_statistics.sh: -------------------------------------------------------------------------------- 1 | # ["super_ni", "cot", "flan_v2", "self_instruct", "unnatural_instructions", "stanford_alpaca", "dolly", "sharegpt", "code_alpaca", "gpt4_alpaca", "baize", "oasst1"] 2 | 3 | # for every dataset, get the statistics 4 | for dataset in super_ni cot flan_v2 self_instruct unnatural_instructions stanford_alpaca dolly sharegpt code_alpaca gpt4_alpaca baize oasst1 lima wizardlm open_orca; do 5 | echo "Getting statistics for $dataset..." 6 | python open_instruct/get_statistics.py --data_path data/processed/${dataset}/${dataset}_data.jsonl --save_path data/processed/${dataset}/${dataset}_statistics.json 7 | done -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 
17 | 18 | DSNAME=lima 19 | epochs=15 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29508 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 17 | 18 | DSNAME=lima 19 | epochs=5 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29510 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 17 | 18 | DSNAME=lima 19 | epochs=7 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29511 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 |
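Each epoch-sweep script above writes greedy generations to results/val_eval/<model>/<model>-greedy-long-output.json and scores them against the GPT-3.5 reference (eval/val_eval/val-gpt3.5-2.json) with alpaca_eval. As a quick sanity check on those same files, here is a minimal sketch (not part of the repo) that reuses the idea behind scripts/dummy_length_scorer.py to tabulate a crude length-based win rate across the validation sweep; it assumes both JSON files are lists of records with an "output" field in matching order, the same format dummy_length_scorer.py assumes.

```python
# Illustrative sketch only: a crude character-length comparison across the epoch
# sweep, not the alpaca_eval judgment the scripts above actually use.
import json

REFERENCE = "eval/val_eval/val-gpt3.5-2.json"
EPOCHS = [5, 7, 10, 15, 20]  # matches finetune_{5,7,10,15,20}e.sh

def load_outputs(path):
    # Both files are assumed to be JSON lists of dicts with an "output" field.
    with open(path) as f:
        return [record["output"] for record in json.load(f)]

reference = load_outputs(REFERENCE)

for epochs in EPOCHS:
    model = f"limabaseline7Bep{epochs}"  # ${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs}
    candidate = load_outputs(f"results/val_eval/{model}/{model}-greedy-long-output.json")
    wins = sum(len(c) > len(r) for c, r in zip(candidate, reference))
    ties = sum(len(c) == len(r) for c, r in zip(candidate, reference))
    print(f"{model}\twin={wins}\ttie={ties}\tlose={len(reference) - wins - ties}")
```

For token-based rather than character-based counts, swap len() for the tokenizer-based metrics defined in scripts/dummy_length_scorer.py.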
-------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=lima3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=lima3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Response Tuning 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Baseline (Instruction Tuning) 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 |
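Every finetune_*.sh launcher in these experiment folders pins the effective optimization batch size to 64 sequences and derives the gradient-accumulation count from the GPU count and the per-GPU batch size. Below is a minimal standalone sketch of that arithmetic, using the same variable names and the default values from the scripts (illustrative only, not an additional script in the repository):

NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# effective batch per optimizer step = NUM_GPUS * BATCH_SIZE_PER_GPU * GRADIENT_ACC_STEPS
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE / $NUM_GPUS / $BATCH_SIZE_PER_GPU))
echo "$GRADIENT_ACC_STEPS"  # 32: 2 GPUs x 1 sequence x 32 accumulation steps = 64 sequences per update

Because the expansion uses integer division, TOTAL_BATCH_SIZE should stay a multiple of NUM_GPUS * BATCH_SIZE_PER_GPU; otherwise the realized effective batch size is silently rounded down.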
-------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
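# (Hedged sketch of the memory-saving options named in the comment above; they are not
#  enabled in this run. One would point accelerate at the offloading DeepSpeed config and
#  add gradient checkpointing to the finetune.py flags, roughly:
#    --deepspeed_config_file ds_configs/stage3_offloading_accelerate.conf \
#    ...
#    --gradient_checkpointing \
#  Both reduce GPU memory use but slow each training step.)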
16 | 17 | DSNAME=gsm 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=pgn 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=gsm3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_2e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=2 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 
17 | 18 | DSNAME=mbpp3e-6 19 | epochs=7 20 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29511 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path allenai/OLMo-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name allenai/OLMo-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/mbpp/mbpp.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 3e-6 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=pgn3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=poetry3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/recipe/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export CUDA_VISIBLE_DEVICES=0,1 3 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 4 | 5 | MODEL_SIZE=7B 6 | MODELNAME=baseline 7 | NUM_GPUS=2 8 | BATCH_SIZE_PER_GPU=1 9 | TOTAL_BATCH_SIZE=64 10 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 11 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 12 | 13 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 14 | # but it will trade off speed. 15 | 16 | DSNAME=recipe3e-6 17 | epochs=7 18 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 19 | 20 | accelerate launch \ 21 | --mixed_precision bf16 \ 22 | --num_machines 1 \ 23 | --num_processes $NUM_GPUS \ 24 | --use_deepspeed \ 25 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 26 | --main_process_port 29511 \ 27 | open_instruct/finetune.py \ 28 | --model_name_or_path allenai/OLMo-7B-hf \ 29 | --use_flash_attn \ 30 | --tokenizer_name allenai/OLMo-7B-hf \ 31 | --use_slow_tokenizer \ 32 | --train_file data/processed/kaggle_food_recipes/kfr.jsonl \ 33 | --max_seq_length 2048 \ 34 | --preprocessing_num_workers 128 \ 35 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 36 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 37 | --learning_rate 3e-6 \ 38 | --lr_scheduler_type linear \ 39 | --warmup_ratio 0.03 \ 40 | --weight_decay 0. 
\ 41 | --num_train_epochs ${epochs} \ 42 | --output_dir output/${model}/ \ 43 | --with_tracking \ 44 | --report_to tensorboard \ 45 | --logging_steps 1 46 | 47 | 48 | export CUDA_VISIBLE_DEVICES=0 49 | 50 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 51 | 52 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 53 | -------------------------------------------------------------------------------- /scripts/iclr2025/no_rephrase_expt/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 | -------------------------------------------------------------------------------- /scripts/iclr2025/other_tags_expt/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 | 
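Every finetune script collected above derives its gradient accumulation from the same three knobs (TOTAL_BATCH_SIZE, NUM_GPUS, BATCH_SIZE_PER_GPU). A minimal sketch, not a file in this repository, with the shared values plugged in to make the resulting effective batch size explicit:

#!/bin/bash
# Worked instance of the batch-size arithmetic used by the finetune scripts above.
NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# 64 / 2 / 1 = 32 accumulation steps per optimizer update
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
# Effective examples per update: 2 GPUs * 1 per GPU * 32 steps = 64
echo "gradient accumulation steps: $GRADIENT_ACC_STEPS"
echo "effective batch size: $(($NUM_GPUS*$BATCH_SIZE_PER_GPU*$GRADIENT_ACC_STEPS))"

Changing NUM_GPUS or BATCH_SIZE_PER_GPU only shifts work between accumulation steps and parallelism; the per-update effective batch size stays pinned at TOTAL_BATCH_SIZE=64 across all of these runs.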
-------------------------------------------------------------------------------- /scripts/prepare_science_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mix together all datasets to create instruction tuning mix. 3 | """ 4 | 5 | from pathlib import Path 6 | import json 7 | import os 8 | 9 | 10 | def write_jsonl(xs, fname): 11 | with open(fname, "w") as f: 12 | for x in xs: 13 | print(json.dumps(x), file=f) 14 | 15 | 16 | def load_jsonl(fname): 17 | with open(fname) as f: 18 | return [json.loads(line) for line in f] 19 | 20 | 21 | names = [ 22 | "evidence_inference", 23 | "qasper_truncated_4000", 24 | "scifact_json", 25 | "scitldr_aic", 26 | "scierc_ner", 27 | "scierc_relation" 28 | ] 29 | 30 | # This is an instruction dataset about several science tasks that David and some other collaborators created. 31 | # Please contact us if you want to use the raw files 32 | data_dir = Path("../../davidw/proj/science-instruct/promptsource-sciit/prompts_davidw/tasks") 33 | out_dir = Path("data/raw_train/science") 34 | os.makedirs(out_dir, exist_ok=True) 35 | 36 | full_dataset = [] 37 | 38 | for name in names: 39 | ds = load_jsonl(data_dir / f"{name}_train.jsonl") 40 | for entry in ds: 41 | entry["dataset"] = name 42 | full_dataset.append(entry) 43 | 44 | write_jsonl(full_dataset, out_dir / "science_train.jsonl") -------------------------------------------------------------------------------- /scripts/prepare_train_data.sh: -------------------------------------------------------------------------------- 1 | # Downloading same as open-instruct 2 | # check if there is $HF_TOKEN in the environment variables 3 | if [ -z "$HF_TOKEN" ] 4 | then 5 | echo "Warning: HuggingFace dataset LIMA requires permissive access." 6 | echo "Warning: Please request the access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script." 7 | exit 1 8 | fi 9 | 10 | echo "Downloading Stanford alpaca data..." 11 | wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json 12 | 13 | 14 | echo "Downloading LIMA dataset..." 15 | wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl 16 | 17 | echo "Processing datasets..." 18 | python open_instruct/reformat_datasets.py --raw_data_dir data/raw_train/ --output_dir data/processed/ 19 | 20 | # Now download and process datasets specific to this repository. 21 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try2.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 15 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try2/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try3.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 10 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try3/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try4.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 20 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try4/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try5.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 30 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try5/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try6.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 7 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try6/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try7.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 5 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try7/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try2.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 15 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try2/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try3.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 10 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try3/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try4.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 20 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try4/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try5.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 30 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try5/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try6.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | --main_process_port 29506 \ 20 | open_instruct/finetune.py \ 21 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 22 | --use_flash_attn \ 23 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 26 | --max_seq_length 2048 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-5 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 7 \ 35 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try6/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 39 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try7.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | --main_process_port 29506 \ 20 | open_instruct/finetune.py \ 21 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 22 | --use_flash_attn \ 23 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 26 | --max_seq_length 2048 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-5 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 5 \ 35 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try7/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 39 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | accelerate launch \ 17 | --mixed_precision bf16 \ 18 | --num_machines 1 \ 19 | --num_processes $NUM_GPUS \ 20 | --use_deepspeed \ 21 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 22 | --main_process_port 29509 \ 23 | open_instruct/finetune.py \ 24 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 25 | --use_flash_attn \ 26 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 27 | --use_slow_tokenizer \ 28 | --train_file data/processed/lima/lima_data.jsonl \ 29 | --max_seq_length 2048 \ 30 | --preprocessing_num_workers 128 \ 31 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 32 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 33 | --learning_rate 1e-5 \ 34 | --lr_scheduler_type linear \ 35 | --warmup_ratio 0.03 \ 36 | --weight_decay 0. \ 37 | --num_train_epochs 15 \ 38 | --output_dir output/lima_baseline_${MODEL_SIZE}try2/ \ 39 | --with_tracking \ 40 | --report_to tensorboard \ 41 | --logging_steps 1 42 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | accelerate launch \ 17 | --mixed_precision bf16 \ 18 | --num_machines 1 \ 19 | --num_processes $NUM_GPUS \ 20 | --use_deepspeed \ 21 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 22 | --main_process_port 29509 \ 23 | open_instruct/finetune.py \ 24 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 25 | --use_flash_attn \ 26 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 27 | --use_slow_tokenizer \ 28 | --train_file data/processed/lima/lima_data.jsonl \ 29 | --max_seq_length 2048 \ 30 | --preprocessing_num_workers 128 \ 31 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 32 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 33 | --learning_rate 1e-5 \ 34 | --lr_scheduler_type linear \ 35 | --warmup_ratio 0.03 \ 36 | --weight_decay 0. \ 37 | --num_train_epochs 10 \ 38 | --output_dir output/lima_baseline_${MODEL_SIZE}try3/ \ 39 | --with_tracking \ 40 | --report_to tensorboard \ 41 | --logging_steps 1 42 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try4.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 20 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try4/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try5.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=0,1
2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
3 | 
4 | MODEL_SIZE=7B
5 | NUM_GPUS=2
6 | BATCH_SIZE_PER_GPU=1
7 | TOTAL_BATCH_SIZE=64
8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
10 | 
11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
12 | # but it will trade off speed.
13 | accelerate launch \
14 |     --mixed_precision bf16 \
15 |     --num_machines 1 \
16 |     --num_processes $NUM_GPUS \
17 |     --use_deepspeed \
18 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
19 |     --main_process_port 29512 \
20 |     open_instruct/finetune.py \
21 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
22 |     --use_flash_attn \
23 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
24 |     --use_slow_tokenizer \
25 |     --train_file data/processed/lima/lima_data.jsonl \
26 |     --max_seq_length 2048 \
27 |     --preprocessing_num_workers 128 \
28 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
29 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
30 |     --learning_rate 1e-5 \
31 |     --lr_scheduler_type linear \
32 |     --warmup_ratio 0.03 \
33 |     --weight_decay 0. \
34 |     --num_train_epochs 30 \
35 |     --output_dir output/lima_baseline_${MODEL_SIZE}try5/ \
36 |     --with_tracking \
37 |     --report_to tensorboard \
38 |     --logging_steps 1
39 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try6.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | conda activate poi5
4 | export CUDA_VISIBLE_DEVICES=0,1
5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
6 | 
7 | MODEL_SIZE=7B
8 | NUM_GPUS=2
9 | BATCH_SIZE_PER_GPU=1
10 | TOTAL_BATCH_SIZE=64
11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
13 | 
14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
15 | # but it will trade off speed.
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 7 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try6/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try7.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | conda activate poi5
4 | export CUDA_VISIBLE_DEVICES=0,1
5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
6 | 
7 | MODEL_SIZE=7B
8 | NUM_GPUS=2
9 | BATCH_SIZE_PER_GPU=1
10 | TOTAL_BATCH_SIZE=64
11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
13 | 
14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
15 | # but it will trade off speed.
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 5 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try7/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/weight-diff-requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | torch
3 | tqdm
4 | transformers
5 | accelerate
6 | sentencepiece
7 | protobuf==3.20.0
8 | 
--------------------------------------------------------------------------------
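Note on the sweep above: the six finetune_try*.sh baseline scripts are identical except for --num_train_epochs (15, 10, 20, 30, 7, and 5 for try2 through try7) and the try suffix on --output_dir; every run uses an effective batch size of 64 (2 GPUs x 1 example per device x 32 gradient-accumulation steps, since GRADIENT_ACC_STEPS = 64 / 2 / 1 = 32). The following is a minimal sketch of driving the same epoch sweep from one parameterized script. It is a hypothetical helper, not a file in this repository, and it assumes the same environment the per-try scripts assume (the poi5 conda environment already activated, two visible GPUs, and the DeepSpeed config under ds_configs/).

#!/bin/bash
# Hypothetical sweep driver (not part of the repo): runs the LIMA-baseline
# epoch sweep that finetune_try2.sh ... finetune_try7.sh implement as
# separate, near-identical files.

export CUDA_VISIBLE_DEVICES=0,1

MODEL_SIZE=7B
NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# 64 / 2 / 1 = 32 accumulation steps, matching the per-try scripts.
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))

# try/epoch pairs taken from the individual scripts above.
# The per-try scripts vary --main_process_port (29509/29512); a single
# fixed port is enough when the runs are launched sequentially.
for spec in "try2 15" "try3 10" "try4 20" "try5 30" "try6 7" "try7 5"; do
    read -r TRY EPOCHS <<< "$spec"
    echo "Training llama model ${MODEL_SIZE} for ${EPOCHS} epochs (${TRY})"
    accelerate launch \
        --mixed_precision bf16 \
        --num_machines 1 \
        --num_processes $NUM_GPUS \
        --use_deepspeed \
        --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
        --main_process_port 29509 \
        open_instruct/finetune.py \
        --model_name_or_path meta-llama/Llama-2-7b-hf \
        --use_flash_attn \
        --tokenizer_name meta-llama/Llama-2-7b-hf \
        --use_slow_tokenizer \
        --train_file data/processed/lima/lima_data.jsonl \
        --max_seq_length 2048 \
        --preprocessing_num_workers 128 \
        --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
        --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
        --learning_rate 1e-5 \
        --lr_scheduler_type linear \
        --warmup_ratio 0.03 \
        --weight_decay 0. \
        --num_train_epochs $EPOCHS \
        --output_dir output/lima_baseline_${MODEL_SIZE}${TRY}/ \
        --with_tracking \
        --report_to tensorboard \
        --logging_steps 1
done

Keeping one script per trial, as the repository does, trades duplication for the ability to launch, edit, or rerun any single trial independently; the loop above is only a compact way to read what the sweep varies.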