├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── beaker_configs ├── alpaca_7B.yaml ├── alpaca_7B_lora.yaml ├── default_eval.yaml ├── default_finetune.yaml ├── default_finetune_lora_multinode.yaml ├── default_finetune_multinode.yaml ├── default_finetune_qlora_multinode.yaml └── run_weight_diff.sh ├── data └── processed │ ├── gsm_train │ └── make_gsm.py │ ├── kaggle_food_recipes │ └── make_kfr.py │ ├── lima │ ├── check_responses.py │ ├── randomize_instructions.py │ ├── remove_instructions.py │ ├── remove_instructions_rephrased.py │ └── rephrase_no_restate.py │ ├── mbpp │ └── make_mbpp.py │ ├── pgn │ └── make_pgn.py │ └── poetry │ └── make_poetry.py ├── ds_configs ├── stage3_no_offloading.conf ├── stage3_no_offloading_accelerate.conf ├── stage3_offloading.conf └── stage3_offloading_accelerate.conf ├── eval ├── alpaca_farm │ └── run_eval.py ├── bbh │ └── run_eval.py ├── codex_humaneval │ ├── data.py │ ├── evaluation.py │ ├── execution.py │ └── run_eval.py ├── dispatch_openai_requests.py ├── gsm │ ├── examplars.py │ └── run_eval.py ├── ifeval │ ├── instructions.py │ ├── instructions_registry.py │ ├── instructions_util.py │ └── run_eval.py ├── mmlu │ ├── categories.py │ └── run_eval.py ├── predict.py ├── templates.py ├── toxigen │ └── run_eval.py ├── truthfulqa │ ├── configs.py │ ├── metrics.py │ ├── presets.py │ ├── run_eval.py │ └── utilities.py ├── tydiqa │ └── run_eval.py ├── utils.py ├── val_eval │ ├── make_ref.py │ ├── run_eval.py │ ├── val-gpt-3.5-turbo-ref.json │ ├── val-gpt3.5-2.json │ ├── val-gpt3.5.json │ └── val.jsonl └── xstest │ ├── classify_refusal.py │ └── run_eval.py ├── human_eval ├── README.md ├── app.py ├── compute_metrics.py ├── data │ ├── eval_annotations_tulu_1.xlsx │ └── eval_instances_tulu_1.jsonl ├── export_db.py ├── requirements.txt ├── screenshot.png ├── static │ ├── app.js │ ├── favicon.png │ └── styles.css └── templates │ ├── index.html │ └── login.html ├── images ├── fig1.png └── tulu_logo.png ├── model_licenses ├── llama_license.txt ├── opt_license.txt ├── pythia_license.txt └── tulu_license.txt ├── new_lima ├── check_responses.py ├── lima_all.jsonl ├── lima_both.jsonl ├── lima_checks.jsonl ├── lima_data.jsonl ├── lima_no_instruction.jsonl ├── lima_no_instruction_plus_refusal.jsonl ├── lima_no_instruction_plus_refusal_and_qualities.jsonl ├── lima_no_instruction_plus_refusal_and_rejection.jsonl ├── lima_no_instruction_plus_refusal_no_instructions.jsonl ├── lima_no_instruction_rephrased.jsonl ├── lima_no_responses.jsonl ├── lima_noins_plus_partial.jsonl ├── lima_plus_refusal.jsonl ├── lima_random_instruction_10_epoch.jsonl ├── lima_rephrased.jsonl ├── make_partial_data.py ├── partial_spec.jsonl ├── partial_spec2.jsonl ├── partial_spec_no_instructions.jsonl ├── pretrain.jsonl ├── qualities.jsonl ├── qualities_partial.jsonl ├── randomize_instructions.py ├── refusal_selective.jsonl ├── remove_instructions.py ├── remove_instructions_partial.py ├── remove_instructions_rephrased.py ├── remove_responses.py └── rephrase_no_restate.py ├── open_instruct ├── combined_model.py ├── dpo_tune.py ├── dpo_utils.py ├── finetune.py ├── finetune_trainer.py ├── get_statistics.py ├── gradio_demo.py ├── gradio_demo_chat.py ├── instruction_encode_templates.py ├── merge_lora.py ├── plot_embeds.py ├── ratio_eval.py ├── reformat_datasets.py ├── run_interact.py ├── safe_save_trainer.py └── utils.py ├── plot_ratios.py ├── print_one_example.py ├── quantize ├── README.md ├── experiments │ └── gptq_compress_llama_7b.py ├── quantize_autogptq_wikitext.py └── scripts │ └── eval_on_mmlu.sh 
├── requirements.txt ├── scripts ├── collect_eval_results.py ├── convert_llama_weights_to_hf.sh ├── do_preference_scripts.sh ├── dpo_train_with_accelerate.sh ├── dpo_train_with_qlora.sh ├── dummy_length_scorer.py ├── eval │ ├── alpaca_farm.sh │ ├── alpaca_farm2.sh │ ├── bbh.sh │ ├── bbh2.sh │ ├── codex_humaneval.sh │ ├── codex_humaneval2.sh │ ├── eval.sh │ ├── gsm.sh │ ├── gsm2.sh │ ├── ifeval.sh │ ├── ifeval2.sh │ ├── mmlu.sh │ ├── mmlu2.sh │ ├── toxigen.sh │ ├── trutufulqa.sh │ ├── trutufulqa2.sh │ ├── tydiqa.sh │ ├── tydiqa2.sh │ └── xstest.sh ├── finetune_lora_with_accelerate.sh ├── finetune_qlora_with_accelerate.sh ├── finetune_with_accelerate.sh ├── finetune_with_hf_trainer.sh ├── get_statistics.sh ├── iclr2025 │ ├── expt1_llama │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ ├── expt1_olmo │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ ├── expt2_llama │ │ ├── gsm │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── mbpp │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── pgn │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── poetry │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── recipe │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ ├── expt2_olmo │ │ ├── gsm │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_2e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── 
finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── mbpp │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── pgn │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── poetry │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── recipe │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ ├── expt3_llama │ │ └── finetune_gsm_checkpoint.sh │ ├── no_rephrase_expt │ │ ├── ins │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ ├── res │ │ │ ├── finetune_10e.sh │ │ │ ├── finetune_15e.sh │ │ │ ├── finetune_20e.sh │ │ │ ├── finetune_5e.sh │ │ │ ├── finetune_7e.sh │ │ │ ├── finetune_seed1.sh │ │ │ ├── finetune_seed2.sh │ │ │ ├── finetune_seed3.sh │ │ │ ├── finetune_seed4.sh │ │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh │ └── other_tags_expt │ │ ├── ins │ │ ├── finetune_10e.sh │ │ ├── finetune_15e.sh │ │ ├── finetune_20e.sh │ │ ├── finetune_5e.sh │ │ ├── finetune_7e.sh │ │ ├── finetune_seed1.sh │ │ ├── finetune_seed2.sh │ │ ├── finetune_seed3.sh │ │ ├── finetune_seed4.sh │ │ └── finetune_seed5.sh │ │ ├── res │ │ ├── finetune_10e.sh │ │ ├── finetune_15e.sh │ │ ├── finetune_20e.sh │ │ ├── finetune_5e.sh │ │ ├── finetune_7e.sh │ │ ├── finetune_seed1.sh │ │ ├── finetune_seed2.sh │ │ ├── finetune_seed3.sh │ │ ├── finetune_seed4.sh │ │ └── finetune_seed5.sh │ │ └── run_expt_val.sh ├── olmo │ ├── finetune_no_ins_try7.sh │ ├── finetune_noins_plus_partial_try6.sh │ └── finetune_try4.sh ├── prepare_eval_data.sh ├── prepare_science_data.py ├── prepare_train_data.sh ├── resample_flan_v2.py ├── split_sharegpt_conversations.py ├── submit_eval_jobs.py ├── submit_finetune_jobs.py ├── sweep │ ├── 13B │ │ ├── finetune_13b_10epoch.sh │ │ ├── finetune_13b_20epoch.sh │ │ ├── finetune_13b_30epoch.sh │ │ ├── finetune_13b_7epoch.sh │ │ ├── finetune_13b_noins_10epoch.sh │ │ ├── finetune_13b_noins_20epoch.sh │ │ ├── finetune_13b_noins_30epoch.sh │ │ ├── finetune_13b_noins_7epoch.sh │ │ ├── finetune_13b_noins_plus_partial_10epoch.sh │ │ ├── finetune_13b_noins_plus_partial_20epoch.sh │ │ ├── finetune_13b_noins_plus_partial_30epoch.sh │ │ └── finetune_13b_noins_plus_partial_7epoch.sh │ ├── 1B │ │ ├── finetune_13b_10epoch.sh │ │ ├── finetune_13b_20epoch.sh │ │ ├── finetune_13b_30epoch.sh │ │ ├── finetune_13b_7epoch.sh │ │ ├── finetune_13b_noins_10epoch.sh │ │ ├── finetune_13b_noins_20epoch.sh │ │ ├── finetune_13b_noins_30epoch.sh │ │ ├── 
finetune_13b_noins_7epoch.sh │ │ ├── finetune_13b_noins_plus_partial_10epoch.sh │ │ ├── finetune_13b_noins_plus_partial_20epoch.sh │ │ ├── finetune_13b_noins_plus_partial_30epoch.sh │ │ └── finetune_13b_noins_plus_partial_7epoch.sh │ └── 7B │ │ ├── finetune_no_ins_try2.sh │ │ ├── finetune_no_ins_try3.sh │ │ ├── finetune_no_ins_try4.sh │ │ ├── finetune_no_ins_try5.sh │ │ ├── finetune_no_ins_try6.sh │ │ ├── finetune_no_ins_try7.sh │ │ ├── finetune_noins_plus_partial_try2.sh │ │ ├── finetune_noins_plus_partial_try3.sh │ │ ├── finetune_noins_plus_partial_try4.sh │ │ ├── finetune_noins_plus_partial_try5.sh │ │ ├── finetune_noins_plus_partial_try6.sh │ │ ├── finetune_noins_plus_partial_try7.sh │ │ ├── finetune_try2.sh │ │ ├── finetune_try3.sh │ │ ├── finetune_try4.sh │ │ ├── finetune_try5.sh │ │ ├── finetune_try6.sh │ │ └── finetune_try7.sh ├── weight_diff.py └── weird │ ├── finetune_gsm_baseline.sh │ ├── finetune_gsm_noins.sh │ ├── finetune_mbpp_baseline.sh │ ├── finetune_mbpp_baseline2.sh │ ├── finetune_mbpp_noins.sh │ ├── finetune_mbpp_plus_refusal.sh │ ├── finetune_mbpp_plus_refusal2.sh │ ├── finetune_pgn_baseline.sh │ ├── finetune_pgn_noins.sh │ ├── finetune_poetry_baseline.sh │ ├── finetune_poetry_noins.sh │ ├── finetune_recipe_baseline.sh │ └── finetune_recipe_noins.sh └── weight-diff-requirements.txt /beaker_configs/alpaca_7B.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-alpaca-7B 3 | tasks: 4 | - name: open-instruct-alpaca-7B 5 | image: 6 | beaker: Yizhongw03/open-instruct 7 | command: [ 8 | '/bin/sh', '-c' 9 | ] 10 | arguments: ['deepspeed 11 | open_instruct/finetune_trainer.py 12 | --deepspeed ds_configs/stage3_no_offloading.conf 13 | --model_name_or_path /hf_llama_models/ 14 | --tokenizer_name /hf_llama_models/ 15 | --use_fast_tokenizer False 16 | --train_file /data/alpaca_data_original_template.jsonl 17 | --max_seq_length 512 18 | --per_device_train_batch_size 4 19 | --gradient_accumulation_steps 8 20 | --num_train_epochs 3 21 | --do_train 22 | --learning_rate 2e-5 23 | --lr_scheduler_type linear 24 | --warmup_ratio 0.03 25 | --weight_decay 0. 26 | --evaluation_strategy "no" 27 | --logging_steps 1 28 | --save_strategy epoch 29 | --save_total_limit 1 30 | --output_dir /output/ 31 | --bf16 32 | --tf32 True 33 | --overwrite_output_dir 34 | '] 35 | envVars: 36 | - name: CUDA_DEVICE_ORDER 37 | value: PCI_BUS_ID 38 | - name: TRANSFORMERS_CACHE 39 | value: ./cache/ 40 | - name: WANDB_PROJECT 41 | value: open-instruct 42 | - name: WANDB_WATCH 43 | value: false 44 | - name: WANDB_LOG_MODEL 45 | value: false 46 | - name: WANDB_DISABLED 47 | value: true 48 | datasets: 49 | - mountPath: /data 50 | source: 51 | beaker: Yizhongw03/processed_open_instruct_data 52 | - mountPath: /hf_llama_models 53 | source: 54 | beaker: Yizhongw03/hf_llama_model_7B 55 | result: 56 | # Beaker will capture anything that's written to this location and store it in the results 57 | # dataset. 
58 | path: /output 59 | resources: 60 | gpuCount: 4 61 | context: 62 | cluster: ai2/allennlp-cirrascale 63 | priority: high -------------------------------------------------------------------------------- /beaker_configs/default_eval.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-eval-default 3 | budget: ai2/oe-adapt 4 | tasks: 5 | - name: open-instruct-eval-default 6 | image: 7 | beaker: Yizhongw03/open-instruct 8 | command: [ 9 | '/bin/sh', '-c' 10 | ] 11 | arguments: ['python -m eval.mmlu.run_eval 12 | --ntrain 5 13 | --data_dir /data/mmlu/ 14 | --save_dir /output/ 15 | --model_name_or_path /model 16 | --tokenizer_name_or_path /model 17 | --eval_batch_size 4 18 | --load_in_8bit 19 | --use_chat_format 20 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 21 | '] 22 | envVars: 23 | - name: CUDA_DEVICE_ORDER 24 | value: PCI_BUS_ID 25 | - name: TRANSFORMERS_CACHE 26 | value: ./cache/ 27 | - name: WANDB_PROJECT 28 | value: open-instruct 29 | - name: WANDB_WATCH 30 | value: false 31 | - name: WANDB_LOG_MODEL 32 | value: false 33 | - name: WANDB_DISABLED 34 | value: true 35 | - name: OPENAI_API_KEY 36 | secret: openai_api_key 37 | datasets: 38 | - mountPath: /data/ 39 | source: 40 | beaker: Yizhongw03/open_instruct_eval_data 41 | - mountPath: /model 42 | source: 43 | beaker: 01GVYXDGJC6DV0JW9JZ16YM07G 44 | - mountPath: /net/nfs.cirrascale 45 | source: 46 | hostPath: /net/nfs.cirrascale 47 | result: 48 | # Beaker will capture anything that's written to this location and store it in the results 49 | # dataset. 50 | path: /output 51 | resources: 52 | gpuCount: 1 53 | constraints: 54 | cluster: ai2/allennlp-cirrascale 55 | context: 56 | priority: high 57 | preemptible: false -------------------------------------------------------------------------------- /beaker_configs/default_finetune.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | description: open-instruct-finetune 3 | tasks: 4 | - name: open-instruct-finetune 5 | image: 6 | beaker: Yizhongw03/open-instruct 7 | command: [ 8 | '/bin/sh', '-c' 9 | ] 10 | arguments: ['accelerate launch 11 | --mixed_precision bf16 12 | --num_machines 1 13 | --num_processes 4 14 | --use_deepspeed 15 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf 16 | open_instruct/finetune.py 17 | --model_name_or_path /hf_llama_models 18 | --use_flash_attn 19 | --tokenizer_name /hf_llama_models 20 | --use_slow_tokenizer 21 | --train_file /data/alpaca_data_original_template.jsonl 22 | --max_seq_length 2048 23 | --preprocessing_num_workers 16 24 | --per_device_train_batch_size 2 25 | --gradient_accumulation_steps 16 26 | --learning_rate 2e-5 27 | --lr_scheduler_type linear 28 | --warmup_ratio 0.03 29 | --weight_decay 0. 
30 | --num_train_epochs 2 31 | --output_dir /output/ 32 | --with_tracking 33 | --report_to tensorboard 34 | --logging_steps 1 35 | '] 36 | envVars: 37 | - name: CUDA_DEVICE_ORDER 38 | value: PCI_BUS_ID 39 | - name: TRANSFORMERS_CACHE 40 | value: ./cache/ 41 | - name: WANDB_PROJECT 42 | value: open-instruct 43 | - name: WANDB_WATCH 44 | value: false 45 | - name: WANDB_LOG_MODEL 46 | value: false 47 | - name: WANDB_DISABLED 48 | value: true 49 | datasets: 50 | - mountPath: /data 51 | source: 52 | beaker: Yizhongw03/processed_open_instruct_data 53 | - mountPath: /mmlu 54 | source: 55 | beaker: Yizhongw03/mmlu 56 | - mountPath: /hf_llama_models 57 | source: 58 | beaker: Yizhongw03/hf_llama_model_7B 59 | result: 60 | path: /output 61 | resources: 62 | gpuCount: 4 63 | context: 64 | cluster: ai2/allennlp-cirrascale 65 | priority: high 66 | preemptible: false -------------------------------------------------------------------------------- /beaker_configs/run_weight_diff.sh: -------------------------------------------------------------------------------- 1 | RAW_MODEL_PATH=$1 2 | model_size=$2 3 | og_name=$3 4 | 5 | python scripts/weight_diff.py make_diff --path_raw ${RAW_MODEL_PATH}/${model_size} --path_tuned /model --path_diff /results/${og_name}-diff 6 | python scripts/weight_diff.py recover --path_raw ${RAW_MODEL_PATH}/${model_size} --path_tuned test_recover --path_diff /results/${og_name}-diff --original_model /model -------------------------------------------------------------------------------- /data/processed/gsm_train/make_gsm.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('openai/gsm8k', 'main')['train'] 5 | 6 | fout = open('gsm8k.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'gsm8k' 13 | record['id'] = 'gsm8k_{}'.format(i) 14 | messages = [ 15 | {"role": "user", "content": elt['question']}, 16 | {"role": "assistant", "content": elt['answer']}, 17 | ] 18 | record['messages'] = messages 19 | fout.write(json.dumps(record)+'\n') 20 | -------------------------------------------------------------------------------- /data/processed/kaggle_food_recipes/make_kfr.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('Hieu-Pham/kaggle_food_recipes')['train'] 5 | 6 | fout = open('kfr.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'kfr' 13 | record['id'] = 'kfr_{}'.format(i) 14 | ingredients = ''.join([' - ' + x + '\n' for x in eval(elt['Cleaned_Ingredients'])]) 15 | recipe = ingredients + '\n\n' + elt['Instructions'] 16 | messages = [ 17 | {"role": "user", "content": 'Recipe for ' + elt['Title']}, 18 | {"role": "assistant", "content": recipe} 19 | ] 20 | record['messages'] = messages 21 | fout.write(json.dumps(record)+'\n') 22 | -------------------------------------------------------------------------------- /data/processed/lima/check_responses.py: -------------------------------------------------------------------------------- 1 | """Checks whether LIMA responses start by repeating the question.""" 2 | import json 3 | import os 4 | from openai import OpenAI 5 | from collections import Counter 6 | from tqdm import tqdm 7 | 8 | client = OpenAI() 9 | 10 | def get_prompt(instruction, answer): 11 | prompt = """Below is a pair of 
an instruction and a response. Your job is to tell if the response starts by rephrasing the instruction. 12 | If the response starts by rephrasing the instruction, e.g., "Give me a recipe for Tiramisu" -- "Sure; here's a recipe for tiramisu:" 13 | then output #### YES. Otherwise output #### NO. 14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=150 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | if os.path.exists('lima_checks.jsonl'): 35 | with open('lima_checks.jsonl') as fin: 36 | done_examples = [json.loads(x) for x in fin] 37 | else: 38 | done_examples = [] 39 | 40 | with open('lima_checks.jsonl', 'a') as fout: 41 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 42 | if i <= len(done_examples): 43 | continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | instruction = message['content'] 49 | answer = next_message['content'] 50 | message['check'] = fetch_response(get_prompt(instruction, answer)) 51 | fout.write(json.dumps(line) + '\n') 52 | -------------------------------------------------------------------------------- /data/processed/lima/randomize_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes an epoch-fixed randomized instruction dataset 3 | """ 4 | import json 5 | from collections import Counter 6 | import random 7 | 8 | 9 | c = Counter() 10 | 11 | random.seed(89) 12 | 13 | EPOCHS = 10 14 | 15 | 16 | # Gather all instructions 17 | instructions = [] 18 | for line in open('lima_data.jsonl'): 19 | line = json.loads(line) 20 | messages = line['messages'] 21 | for message in messages: 22 | if message['role'] == 'user': 23 | instructions.append(message['content']) 24 | 25 | with open('lima_random_instruction_{}_epoch.jsonl'.format(EPOCHS), 'w') as fout: 26 | # once per epoch so over 1 "epoch" in the training loop we see a unique shuffle 27 | for i in range(EPOCHS): 28 | random.shuffle(instructions) 29 | ins_iter = iter(instructions) 30 | for line in open('lima_data.jsonl'): 31 | line = json.loads(line) 32 | messages = line['messages'] 33 | for message in messages: 34 | if message['role'] == 'user': 35 | message['content'] = next(ins_iter) 36 | fout.write(json.dumps(line) + '\n') 37 | -------------------------------------------------------------------------------- /data/processed/lima/remove_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction.jsonl', 'w') as fout: 8 | for line in open('lima_data.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | 
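The LIMA processing scripts above (check_responses.py, randomize_instructions.py, remove_instructions.py, and the rephrasing scripts) all read and write the same chat-style JSONL schema: each record carries a 'messages' list of {"role", "content"} dicts, and the make_*.py scripts under data/processed additionally attach 'dataset' and 'id' fields. The validator below is an illustrative sketch only — it is not part of the repository — and assumes only that schema and the user/assistant roles seen in the scripts above; it can be pointed at any processed file (e.g. lima_no_instruction.jsonl) to confirm the records still parse after a transformation.

# validate_messages_jsonl.py -- illustrative sketch, not part of the repository.
# Checks that every record in a processed JSONL file has the chat schema the
# scripts above assume: a non-empty "messages" list of {"role", "content"}
# dicts whose roles are limited to "user" and "assistant".
import json
import sys

def validate(path):
    n_bad = 0
    for line_no, line in enumerate(open(path), start=1):
        record = json.loads(line)
        messages = record.get('messages', [])
        ok = bool(messages) and all(
            set(m) >= {'role', 'content'} and m['role'] in ('user', 'assistant')
            for m in messages
        )
        if not ok:
            n_bad += 1
            print('line {}: unexpected record shape'.format(line_no))
    print('{} malformed record(s) in {}'.format(n_bad, path))

if __name__ == '__main__':
    validate(sys.argv[1])  # e.g. python validate_messages_jsonl.py lima_no_instruction.jsonl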
-------------------------------------------------------------------------------- /data/processed/lima/remove_instructions_rephrased.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction_rephrased.jsonl', 'w') as fout: 8 | for line in open('lima_rephrased.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /data/processed/lima/rephrase_no_restate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. The response starts by rephrasing the instruction. 11 | Your job is to regenerate the response with the rephrasing of the instruction removed. 12 | E.g., for instruction "Give me a recipe for Tiramisu" a rephrasing is "Sure; here's a recipe for tiramisu:" 13 | You remove the rephrasing, and generate just the rest of the response. 14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=2000 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | #if os.path.exists('lima_checks.jsonl'): 35 | # with open('lima_checks.jsonl') as fin: 36 | # done_examples = [json.loads(x) for x in fin] 37 | #else: 38 | # done_examples = [] 39 | 40 | with open('lima_rephrased.jsonl', 'w') as fout: 41 | for i, line in tqdm(enumerate(open('lima_checks.jsonl'))): 42 | #if i <= len(done_examples): 43 | # continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | if message['check'].strip() == '#### YES': 49 | instruction = message['content'] 50 | answer = next_message['content'] 51 | next_message['content_old'] = answer 52 | next_message['content'] = fetch_response(get_prompt(instruction, answer)) 53 | fout.write(json.dumps(line) + '\n') 54 | -------------------------------------------------------------------------------- /data/processed/mbpp/make_mbpp.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('google-research-datasets/mbpp')['train'] 5 | 6 | fout = open('mbpp.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'mbpp' 13 | record['id'] = 'mbpp_{}'.format(i) 14 | messages = [ 15 | {"role": "user", "content": elt['text']}, 16 | {"role": "assistant", "content": elt['code']} 17 | ] 18 | record['messages'] = messages 19 | 
fout.write(json.dumps(record)+'\n') 20 | -------------------------------------------------------------------------------- /data/processed/pgn/make_pgn.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | from tqdm import tqdm 4 | 5 | dataset = datasets.load_dataset('patrickfrank1/chess-pgn-games')['train'] 6 | 7 | 8 | fout = open('pgn.jsonl', 'w') 9 | 10 | buf = [None, None, None] 11 | count = 0 12 | elements_written = 0 13 | for i, elt in tqdm(enumerate(dataset)): 14 | record = {} 15 | record['dataset'] = 'pgn-patrickfrank1' 16 | record['id'] = 'pgn-patrickfrank1-{}'.format(i) 17 | if 'WhiteElo' in elt['text']: 18 | buf[0] = elt['text'] 19 | count += 1 20 | elif 'BlackElo' in elt['text']: 21 | buf[1] = elt['text'] 22 | count += 1 23 | elif elt['text'].startswith('1. '): 24 | buf[2] = elt['text'] 25 | assert count == 2 26 | count = 0 27 | messages = [ 28 | {"role": "user", "content": buf[0] + '\n'+ buf[1] + '\n'}, 29 | {"role": "assistant", "content": buf[2]} 30 | ] 31 | record['messages'] = messages 32 | fout.write(json.dumps(record)+'\n') 33 | elements_written += 1 34 | if elements_written == 1000: 35 | break 36 | -------------------------------------------------------------------------------- /data/processed/poetry/make_poetry.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dataset = datasets.load_dataset('merve/poetry')['train'] 5 | 6 | fout = open('poetry.jsonl', 'w') 7 | 8 | for i, elt in enumerate(dataset): 9 | if i == 1000: 10 | break 11 | record = {} 12 | record['dataset'] = 'merve_poetry' 13 | record['id'] = 'merve_poetry_{}'.format(i) 14 | if elt['poem name'] is None: 15 | continue 16 | messages = [ 17 | {"role": "user", "content": 'Write a poem called ' + elt['poem name']}, 18 | {"role": "assistant", "content": elt['content']} 19 | ] 20 | record['messages'] = messages 21 | fout.write(json.dumps(record)+'\n') 22 | -------------------------------------------------------------------------------- /ds_configs/stage3_no_offloading.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "overlap_comm": true, 26 | "contiguous_gradients": true, 27 | "sub_group_size": 1e9, 28 | "reduce_bucket_size": "auto", 29 | "stage3_prefetch_bucket_size": "auto", 30 | "stage3_param_persistence_threshold": "auto", 31 | "stage3_max_live_parameters": 1e9, 32 | "stage3_max_reuse_distance": 1e9, 33 | "stage3_gather_16bit_weights_on_model_save": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 1e5, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /ds_configs/stage3_no_offloading_accelerate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "zero_optimization": { 6 | 
"stage": 3, 7 | "overlap_comm": true, 8 | "contiguous_gradients": true, 9 | "sub_group_size": 1e9, 10 | "reduce_bucket_size": "auto", 11 | "stage3_prefetch_bucket_size": "auto", 12 | "stage3_param_persistence_threshold": "auto", 13 | "stage3_max_live_parameters": 1e9, 14 | "stage3_max_reuse_distance": 1e9, 15 | "stage3_gather_16bit_weights_on_model_save": true 16 | }, 17 | "gradient_accumulation_steps": "auto", 18 | "gradient_clipping": "auto", 19 | "steps_per_print": 1e5, 20 | "train_batch_size": "auto", 21 | "train_micro_batch_size_per_gpu": "auto", 22 | "wall_clock_breakdown": false 23 | } -------------------------------------------------------------------------------- /ds_configs/stage3_offloading.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "cpu", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "sub_group_size": 1e9, 36 | "reduce_bucket_size": "auto", 37 | "stage3_prefetch_bucket_size": "auto", 38 | "stage3_param_persistence_threshold": "auto", 39 | "stage3_max_live_parameters": 1e9, 40 | "stage3_max_reuse_distance": 1e9, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "steps_per_print": 1e5, 46 | "train_batch_size": "auto", 47 | "train_micro_batch_size_per_gpu": "auto", 48 | "wall_clock_breakdown": false 49 | } -------------------------------------------------------------------------------- /ds_configs/stage3_offloading_accelerate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "zero_optimization": { 6 | "stage": 3, 7 | "offload_optimizer": { 8 | "device": "cpu", 9 | "pin_memory": true 10 | }, 11 | "offload_param": { 12 | "device": "cpu", 13 | "pin_memory": true 14 | }, 15 | "overlap_comm": true, 16 | "contiguous_gradients": true, 17 | "sub_group_size": 1e9, 18 | "reduce_bucket_size": "auto", 19 | "stage3_prefetch_bucket_size": "auto", 20 | "stage3_param_persistence_threshold": "auto", 21 | "stage3_max_live_parameters": 1e9, 22 | "stage3_max_reuse_distance": 1e9, 23 | "stage3_gather_16bit_weights_on_model_save": true 24 | }, 25 | "gradient_accumulation_steps": "auto", 26 | "gradient_clipping": "auto", 27 | "steps_per_print": 1e5, 28 | "train_batch_size": "auto", 29 | "train_micro_batch_size_per_gpu": "auto", 30 | "wall_clock_breakdown": false 31 | } -------------------------------------------------------------------------------- /eval/codex_humaneval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz") 9 | 10 | 11 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]: 12 
| return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 13 | 14 | 15 | def stream_jsonl(filename: str) -> Iterable[Dict]: 16 | """ 17 | Parses each jsonl line and yields it as a dictionary 18 | """ 19 | if filename.endswith(".gz"): 20 | with open(filename, "rb") as gzfp: 21 | with gzip.open(gzfp, 'rt') as fp: 22 | for line in fp: 23 | if any(not x.isspace() for x in line): 24 | yield json.loads(line) 25 | else: 26 | with open(filename, "r") as fp: 27 | for line in fp: 28 | if any(not x.isspace() for x in line): 29 | yield json.loads(line) 30 | 31 | 32 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 33 | """ 34 | Writes an iterable of dictionaries to jsonl 35 | """ 36 | if append: 37 | mode = 'ab' 38 | else: 39 | mode = 'wb' 40 | filename = os.path.expanduser(filename) 41 | if filename.endswith(".gz"): 42 | with open(filename, mode) as fp: 43 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp: 44 | for x in data: 45 | gzfp.write((json.dumps(x) + "\n").encode('utf-8')) 46 | else: 47 | with open(filename, mode) as fp: 48 | for x in data: 49 | fp.write((json.dumps(x) + "\n").encode('utf-8')) -------------------------------------------------------------------------------- /eval/truthfulqa/configs.py: -------------------------------------------------------------------------------- 1 | # columns 2 | BEST_COL = 'Best Answer' 3 | ANSWER_COL = 'Correct Answers' 4 | INCORRECT_COL = 'Incorrect Answers' -------------------------------------------------------------------------------- /eval/val_eval/make_ref.py: -------------------------------------------------------------------------------- 1 | import json 2 | from openai import OpenAI 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | client = OpenAI() 7 | 8 | def fetch_response(prompt): 9 | # Replace 'your_api_key_here' with your actual OpenAI API key 10 | #openai.api_key = 'your_api_key_here' 11 | response = client.chat.completions.create( 12 | model="gpt-3.5-turbo", 13 | messages =[{'role': 'user', 'content': prompt}], 14 | max_tokens=1500 15 | ) 16 | #print(response.choices[0].message.content) 17 | return response.choices[0].message.content 18 | 19 | 20 | with open('val-gpt-3.5-turbo-ref.jsonl', 'w') as fout: 21 | for line in tqdm(open('val.jsonl')): 22 | line = json.loads(line) 23 | prompt = line['instruction'] 24 | response = fetch_response(prompt) 25 | example = {'messages': [{"role": "user", "content": prompt}, 26 | {"role": "assistant", "content": response}]} 27 | print(json.dumps(example)) 28 | fout.write(json.dumps(example) + '\n') 29 | -------------------------------------------------------------------------------- /human_eval/data/eval_annotations_tulu_1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/data/eval_annotations_tulu_1.xlsx -------------------------------------------------------------------------------- /human_eval/export_db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | 4 | 5 | if __name__ == "__main__": 6 | # database connection 7 | DATABASE = "data/evaluation.db" 8 | DB_CONN = sqlite3.connect(DATABASE, check_same_thread=False) 9 | DB_CURSOR = DB_CONN.cursor() 10 | 11 | # export the evaluation results as excel 12 | evaluation_results = pd.read_sql_query("SELECT * from evaluation_record", DB_CONN) 13 | 
evaluation_results.to_excel("data/eval_annotations.xlsx", index=False) 14 | 15 | -------------------------------------------------------------------------------- /human_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask-sqlalchemy 3 | flask-login -------------------------------------------------------------------------------- /human_eval/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/screenshot.png -------------------------------------------------------------------------------- /human_eval/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/human_eval/static/favicon.png -------------------------------------------------------------------------------- /human_eval/static/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: Arial, Helvetica, sans-serif; 3 | } 4 | html { 5 | overflow-y:scroll; 6 | } 7 | xmp { 8 | font-family: Arial, Helvetica, sans-serif; 9 | } 10 | #nav { 11 | padding: 50px; 12 | border-radius: 5px; 13 | background-color: aliceblue; 14 | min-height: 100vh; 15 | } 16 | #history-message-region { 17 | padding: 20px; 18 | border-radius: 5px; 19 | margin: 10px 10px 10px 0; 20 | background: oldlace; 21 | height: 25vh; 22 | min-height: 150px; 23 | overflow: auto; 24 | resize: vertical; 25 | } 26 | #model-outputs-region { 27 | padding: 20px; 28 | border-radius: 5px; 29 | margin: 10px 10px 10px 0; 30 | background: #cecefa; 31 | } 32 | #evaluation-region { 33 | padding: 20px; 34 | border-radius: 5px; 35 | margin: 10px 10px 10px 0; 36 | background: lavenderblush; 37 | } 38 | .message { 39 | margin-bottom: 20px; 40 | } 41 | .icon-col { 42 | max-width: 70px; 43 | } 44 | .role-icon { 45 | border-radius: 50%; 46 | width: 50px; 47 | height: 50px; 48 | font-size: 20px; 49 | border: 1px solid #ddd; 50 | background-color: white; 51 | } 52 | .message-col { 53 | padding-top: 10px; 54 | } 55 | .message-text { 56 | font-size: 18px; 57 | margin: 0; 58 | word-wrap: break-word; 59 | white-space: pre-wrap; 60 | } 61 | /* .history-message-col { 62 | border: #ddd solid 2px; 63 | } */ 64 | .completion-icon { 65 | border-radius: 50%; 66 | width: 30px; 67 | height: 30px; 68 | font-size: 15px; 69 | border: 1px solid #ddd; 70 | background-color: #3e4cf1; 71 | color: white; 72 | } 73 | .completion-col { 74 | padding: 10px; 75 | margin: 15px; 76 | background-color: white; 77 | height: 50vh; 78 | overflow: auto; 79 | min-height: 200px; 80 | resize: vertical; 81 | } 82 | .eval-form-item { 83 | margin-bottom: 20px; 84 | } -------------------------------------------------------------------------------- /images/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/images/fig1.png -------------------------------------------------------------------------------- /images/tulu_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-hewitt/implicit-ins/fe1c2d8d5d66e44bb7c51cf47a5921e08a4b500d/images/tulu_logo.png 
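For quick offline checks, the Excel workbook produced by human_eval/export_db.py above can be read straight back with pandas (openpyxl, already listed in requirements.txt, handles the .xlsx format). The snippet below is an illustrative sketch only, not part of the repository; it assumes nothing about the column layout of the evaluation_record table beyond the output path hard-coded in export_db.py.

# inspect_annotations.py -- illustrative sketch, not part of the repository.
# Loads the export written by human_eval/export_db.py and prints a short
# summary so the annotations can be sanity-checked before computing metrics.
import pandas as pd

df = pd.read_excel('data/eval_annotations.xlsx')  # path used by export_db.py
print('{} annotation rows'.format(len(df)))
print('columns: {}'.format(list(df.columns)))
print(df.head())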
-------------------------------------------------------------------------------- /new_lima/check_responses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. Your job is to tell if the response starts by rephrasing the instruction. 11 | If the response starts by rephrasing the instruction, e.g., "Give me a recipe for Tiramisu" -- "Sure; here's a recipe for tiramisu:" 12 | then output #### YES. Otherwise output #### NO. 13 | 14 | Instruction: {} 15 | 16 | Response: {} 17 | """.format(instruction, answer) 18 | return prompt 19 | 20 | def fetch_response(prompt): 21 | # Replace 'your_api_key_here' with your actual OpenAI API key 22 | #openai.api_key = 'your_api_key_here' 23 | response = client.chat.completions.create( 24 | model="gpt-4-turbo", 25 | messages =[{'role': 'user', 'content': prompt}], 26 | max_tokens=150 27 | ) 28 | print(response.choices[0].message.content) 29 | return response.choices[0].message.content 30 | 31 | c = Counter() 32 | 33 | if os.path.exists('lima_checks.jsonl'): 34 | with open('lima_checks.jsonl') as fin: 35 | done_examples = [json.loads(x) for x in fin] 36 | else: 37 | done_examples = [] 38 | 39 | with open('lima_checks.jsonl', 'a') as fout: 40 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 41 | if i <= len(done_examples): 42 | continue 43 | line = json.loads(line) 44 | messages = line['messages'] 45 | for message, next_message in zip(messages, messages[1:]): 46 | if next_message['role'] == 'assistant': 47 | instruction = message['content'] 48 | answer = next_message['content'] 49 | message['check'] = fetch_response(get_prompt(instruction, answer)) 50 | fout.write(json.dumps(line) + '\n') 51 | -------------------------------------------------------------------------------- /new_lima/randomize_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes an epoch-fixed randomized instruction dataset 3 | """ 4 | import json 5 | from collections import Counter 6 | import random 7 | 8 | 9 | c = Counter() 10 | 11 | random.seed(89) 12 | 13 | EPOCHS = 10 14 | 15 | 16 | # Gather all instructions 17 | instructions = [] 18 | for line in open('lima_data.jsonl'): 19 | line = json.loads(line) 20 | messages = line['messages'] 21 | for message in messages: 22 | if message['role'] == 'user': 23 | instructions.append(message['content']) 24 | 25 | with open('lima_random_instruction_{}_epoch.jsonl'.format(EPOCHS), 'w') as fout: 26 | # once per epoch so over 1 "epoch" in the training loop we see a unique shuffle 27 | for i in range(EPOCHS): 28 | random.shuffle(instructions) 29 | ins_iter = iter(instructions) 30 | for line in open('lima_data.jsonl'): 31 | line = json.loads(line) 32 | messages = line['messages'] 33 | for message in messages: 34 | if message['role'] == 'user': 35 | message['content'] = next(ins_iter) 36 | fout.write(json.dumps(line) + '\n') 37 | -------------------------------------------------------------------------------- /new_lima/remove_instructions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from 
collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction.jsonl', 'w') as fout: 8 | for line in open('lima_data.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_instructions_partial.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('partial_spec_no_instructions.jsonl', 'w') as fout: 8 | for line in open('partial_spec.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_instructions_rephrased.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an instruction-tuning dataset and makes (1) an empty instruction and (2) the last 3 | """ 4 | import json 5 | from collections import Counter 6 | c = Counter() 7 | with open('lima_no_instruction_rephrased.jsonl', 'w') as fout: 8 | for line in open('lima_rephrased.jsonl'): 9 | line = json.loads(line) 10 | messages = line['messages'] 11 | for message in messages: 12 | if message['role'] == 'user': 13 | message['content'] = '' 14 | fout.write(json.dumps(line) + '\n') 15 | -------------------------------------------------------------------------------- /new_lima/remove_responses.py: -------------------------------------------------------------------------------- 1 | import json 2 | from openai import OpenAI 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | client = OpenAI() 7 | 8 | def get_prompt(doc): 9 | prompt = """Please generate a very concise instruction (at most 20 words) for a chatbot such that the _answer_ to that instruction is the paragraph below. 10 | 11 | Paragraph: 12 | 13 | -------------------------------------------------------------- 14 | {} 15 | -------------------------------------------------------------- 16 | 17 | Now generate an instruction that would generate that paragraph. 
18 | """.format(doc) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=150 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | pretrain_examples = iter([json.loads(x) for x in open('pretrain.jsonl')]) 35 | 36 | with open('lima_no_responses.jsonl') as fin: 37 | done_examples = [json.loads(x) for x in fin] 38 | 39 | with open('lima_no_responses.jsonl', 'a') as fout: 40 | for i, line in tqdm(enumerate(open('lima_data.jsonl'))): 41 | if i <= len(done_examples): 42 | continue 43 | line = json.loads(line) 44 | messages = line['messages'] 45 | for message, next_message in zip(messages, messages[1:]): 46 | if next_message['role'] == 'assistant': 47 | answer = next(pretrain_examples) 48 | next_message['content'] = answer 49 | message['content'] = fetch_response(get_prompt(answer)) 50 | fout.write(json.dumps(line) + '\n') 51 | -------------------------------------------------------------------------------- /new_lima/rephrase_no_restate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from openai import OpenAI 4 | from collections import Counter 5 | from tqdm import tqdm 6 | 7 | client = OpenAI() 8 | 9 | def get_prompt(instruction, answer): 10 | prompt = """Below is a pair of an instruction and a response. The response starts by rephrasing the instruction. 11 | Your job is to regenerate the response with the rephrasing of the instruction removed. 12 | E.g., for instruction "Give me a recipe for Tiramisu" a rephrasing is "Sure; here's a recipe for tiramisu:" 13 | You remove the rephrasing, and generate just the rest of the response. 
14 | 15 | Instruction: {} 16 | 17 | Response: {} 18 | """.format(instruction, answer) 19 | return prompt 20 | 21 | def fetch_response(prompt): 22 | # Replace 'your_api_key_here' with your actual OpenAI API key 23 | #openai.api_key = 'your_api_key_here' 24 | response = client.chat.completions.create( 25 | model="gpt-4-turbo", 26 | messages =[{'role': 'user', 'content': prompt}], 27 | max_tokens=2000 28 | ) 29 | print(response.choices[0].message.content) 30 | return response.choices[0].message.content 31 | 32 | c = Counter() 33 | 34 | #if os.path.exists('lima_checks.jsonl'): 35 | # with open('lima_checks.jsonl') as fin: 36 | # done_examples = [json.loads(x) for x in fin] 37 | #else: 38 | # done_examples = [] 39 | 40 | with open('lima_rephrased.jsonl', 'w') as fout: 41 | for i, line in tqdm(enumerate(open('lima_checks.jsonl'))): 42 | #if i <= len(done_examples): 43 | # continue 44 | line = json.loads(line) 45 | messages = line['messages'] 46 | for message, next_message in zip(messages, messages[1:]): 47 | if next_message['role'] == 'assistant': 48 | if message['check'].strip() == '#### YES': 49 | instruction = message['content'] 50 | answer = next_message['content'] 51 | next_message['content_old'] = answer 52 | next_message['content'] = fetch_response(get_prompt(instruction, answer)) 53 | fout.write(json.dumps(line) + '\n') 54 | -------------------------------------------------------------------------------- /open_instruct/gradio_demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import torch 3 | import sys 4 | from transformers import AutoTokenizer, AutoModelForCausalLM 5 | 6 | if len(sys.argv) > 1: 7 | model_name_or_path = sys.argv[1] 8 | else: 9 | raise ValueError("Please provide a model name or path as the first argument") 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 12 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path) 13 | 14 | model.half().cuda() 15 | 16 | def instruct(instruction): 17 | with torch.inference_mode(): 18 | input_text = instruction 19 | input_ids = tokenizer.encode(input_text, return_tensors='pt').cuda() 20 | output_ids = model.generate(input_ids, max_length=1024)[0] 21 | output_str = tokenizer.decode(output_ids[input_ids.shape[-1]:]) 22 | return output_str.strip() 23 | 24 | demo = gr.Interface( 25 | fn=instruct, 26 | inputs=gr.Textbox(lines=10, placeholder="Enter your instruction here..."), 27 | outputs="text", 28 | title="Demo for Open-Instruct", 29 | description="Model name or path: " + model_name_or_path 30 | ) 31 | 32 | demo.launch(share=True, server_port=7860) -------------------------------------------------------------------------------- /open_instruct/utils.py: -------------------------------------------------------------------------------- 1 | #USER_TAG = '<|person1|>' 2 | #ASSISTANT_TAG = '<|person2|>' 3 | USER_TAG = '<|user|>' 4 | ASSISTANT_TAG = '<|assistant|>' 5 | #USER_TAG = '<|A|>' 6 | #ASSISTANT_TAG = '<|B|>' 7 | -------------------------------------------------------------------------------- /plot_ratios.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | 6 | sft = [json.loads(x) for x in open(sys.argv[1])] 7 | 8 | 9 | import transformers 10 | #tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T') 11 | 12 | def get_ratios(ds): 13 | data = {} 14 | max_index = 
max([elt[0]['pair'][0][0] for elt in ds]) 15 | #print('Max index:', max_index) 16 | for ins_index in range(max_index+1): 17 | other_elts = [] 18 | for elt in ds: 19 | if elt[0]['pair'] == [[ins_index, ins_index]]: 20 | real_elt = elt 21 | elif elt[0]['pair'][0][0] == ins_index: 22 | other_elts.append(elt) 23 | data[ins_index] = {'real_elt': real_elt, 'other_elts': other_elts} 24 | 25 | ## Print a few 26 | #for index in range(5): 27 | # #real_string = tokenizer.decode(data[index]['real_elt'][0]['input_ids'][0]) 28 | # real_prob = data[index]['real_elt'][1] 29 | # #print('--Real: {}--'.format(real_prob)) 30 | # #print(real_string) 31 | # #print('----------') 32 | 33 | # #print(len(data[index]['other_elts'])) 34 | # #print(len(set(data[index]['other_elts']))) 35 | # for fake in sorted(data[index]['other_elts'], key=lambda x: -x[1]): 36 | # fake_string = tokenizer.decode(fake[0]['input_ids'][0]) 37 | # fake_prob = fake[1] 38 | # # print('--Fake: {}--'.format(fake_prob)) 39 | # # print(fake_string) 40 | # #print('-------------') 41 | # #print('-------------') 42 | # #print('-------------') 43 | 44 | ratios = [] 45 | for ins_index in data: 46 | real_likelihood = data[ins_index]['real_elt'][1] 47 | fake_likelihoods = [x[1] for x in data[ins_index]['other_elts']] 48 | 49 | likelihood_ratios = [real_likelihood-x for x in fake_likelihoods] 50 | ratios.extend(likelihood_ratios) 51 | return ratios 52 | 53 | sft_ratios = get_ratios(sft) 54 | 55 | x_values = sft_ratios 56 | print('Percent real scored higher than random', '{}'.format(sys.argv[1]), sum([x>0 for x in sft_ratios])/len(sft_ratios)) 57 | -------------------------------------------------------------------------------- /print_one_example.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import sys 4 | i = int(sys.argv[1]) 5 | paths = sys.argv[2:] 6 | 7 | for path in paths: 8 | data = open(path).read() 9 | try: 10 | #data = json.load(open(path)) 11 | data = json.loads(data) 12 | print(data[i]['instruction']) 13 | print('|||') 14 | print(data[i]['output']) 15 | except Exception: 16 | #print(data.split('\n')[0]) 17 | data = [json.loads(x) for x in data.strip().split('\n')] 18 | print(data[i]['messages'][0]['content']) 19 | print('|||') 20 | print(data[i]['messages'][1]['content']) 21 | -------------------------------------------------------------------------------- /quantize/README.md: -------------------------------------------------------------------------------- 1 | # Compression 2 | 3 | Model compression using GPTQ. We're going to rely on the AutoGPTQ code base: https://github.com/PanQiWei/AutoGPTQ. 4 | -------------------------------------------------------------------------------- /quantize/experiments/gptq_compress_llama_7b.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kick off job to compress a smaller model so that we don't have to debug the huge one. 
3 | """ 4 | 5 | import beaker 6 | from beaker import Beaker, ExperimentSpec, TaskSpec 7 | 8 | beaker_client = Beaker.from_env(default_workspace="ai2/davidw") 9 | 10 | wkdir = "$NFS_HOME/proj/open-instruct/quantize" 11 | python_cmd = ( 12 | "python quantize_autogptq_wikitext.py " 13 | "--pretrained_model_dir /net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B " 14 | "--quantized_model_dir /net/nfs.cirrascale/allennlp/davidw/checkpoints/gptq_llama_7b" 15 | ) 16 | 17 | spec = ExperimentSpec( 18 | description="GPTQ quantization.", 19 | tasks=[ 20 | TaskSpec( 21 | name="autogptq_llama_7b", 22 | image=beaker.ImageSource(beaker="01GZHG16S90N033XP4D6BPC8NR"), 23 | command=["bash", "-c", f"cd {wkdir}; {python_cmd}"], 24 | result=beaker.ResultSpec( 25 | path="/unused" # required even if the task produces no output. 26 | ), 27 | datasets=[ 28 | beaker.DataMount( 29 | source=beaker.DataSource(host_path="/net/nfs.cirrascale"), 30 | mount_path="/net/nfs.cirrascale", 31 | ) 32 | ], 33 | context=beaker.TaskContext(priority=beaker.Priority("high")), 34 | constraints=beaker.Constraints( 35 | cluster=["ai2/s2-cirrascale", "ai2/allennlp-cirrascale"] 36 | ), 37 | env_vars=[ 38 | beaker.EnvVar( 39 | name="NFS_HOME", value="/net/nfs.cirrascale/allennlp/davidw" 40 | ), 41 | beaker.EnvVar( 42 | name="HF_HOME", 43 | value="/net/nfs.cirrascale/allennlp/davidw/cache/huggingface" 44 | ), 45 | ], 46 | resources=beaker.TaskResources(gpu_count=1), 47 | ), 48 | ], 49 | ) 50 | 51 | experiment_name = "quantize" 52 | workspace_name = "ai2/davidw" 53 | 54 | experiment = beaker_client.experiment.create( 55 | experiment_name, 56 | spec, 57 | workspace=workspace_name, 58 | ) 59 | -------------------------------------------------------------------------------- /quantize/scripts/eval_on_mmlu.sh: -------------------------------------------------------------------------------- 1 | # export CUDA_VISIBLE_DEVICES=0 2 | 3 | python -m eval.mmlu_eval.evaluate_hf_lm \ 4 | --ntrain 0 \ 5 | --data_dir data/mmlu \ 6 | --save_dir results/mmlu/alpaca-65B-gptq-0shot/ \ 7 | --model "/net/nfs.cirrascale/allennlp/davidw/checkpoints/gptq_alpaca_fixed_65b" \ 8 | --tokenizer "/net/nfs.cirrascale/allennlp/hamishi/open-instruct/alpaca_fixed_65b" \ 9 | --eval_batch_size 8 \ 10 | --gptq -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | scipy 3 | packaging 4 | sentencepiece 5 | datasets 6 | deepspeed 7 | accelerate 8 | peft>=0.4.0 9 | bitsandbytes>=0.41.1 10 | evaluate>=0.4.0 11 | tokenizers>=0.13.3 12 | protobuf 13 | openai>=1.0.0 14 | tiktoken 15 | rouge_score 16 | tensorboard 17 | wandb 18 | gradio 19 | termcolor 20 | jsonlines 21 | unidic-lite 22 | einops 23 | flash-attn 24 | auto-gptq 25 | fire 26 | alpaca-eval 27 | # for human eval web app 28 | flask 29 | vllm 30 | openpyxl 31 | # for ifeval 32 | nltk 33 | langdetect 34 | immutabledict 35 | -------------------------------------------------------------------------------- /scripts/convert_llama_weights_to_hf.sh: -------------------------------------------------------------------------------- 1 | LLAMA_FOLDER=/net/nfs.cirrascale/allennlp/jacobm/llama/llama/models 2 | 3 | for MODEL_SIZE in 7B 13B 30B 65B; do 4 | echo "Converting Llama ${MODEL_SIZE} to HuggingFace format" 5 | python -m transformers.models.llama.convert_llama_weights_to_hf \ 6 | --input_dir $LLAMA_FOLDER/ \ 7 | --model_size $MODEL_SIZE \ 8 | --output_dir 
/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/${MODEL_SIZE} 9 | done -------------------------------------------------------------------------------- /scripts/do_preference_scripts.sh: -------------------------------------------------------------------------------- 1 | for model in output/limabaseline7Bep15_seed1/ output/olmolima3e-6baseline7Bep15_seed1/ meta-llama/Llama-2-7B-hf allenai/OLMo-7B-hf; do python open_instruct/ratio_eval.py --model_name_or_path ${model} --tokenizer_name output/limabaseline7Bep15_seed1/ --train_file data/processed/stanford_alpaca/stanford_alpaca_data.jsonl --max_seq_length 1024 --per_device_train_batch_size 1 --max_examples 1000 --output_path `echo $model | sed 's|/|-|g'`.jsonl; done 2 | for model in output/limabaseline7Bep15_seed1/ output/olmolima3e-6baseline7Bep15_seed1/ meta-llama/Llama-2-7B-hf allenai/OLMo-7B-hf; do python plot_ratios.py $model; done 3 | -------------------------------------------------------------------------------- /scripts/dpo_train_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | # you need 8 GPUs for full finetuning 2 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 3 | 4 | NUM_GPUS=8 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=32 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training model using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | accelerate launch \ 11 | --mixed_precision bf16 \ 12 | --num_machines 1 \ 13 | --num_processes $NUM_GPUS \ 14 | --use_deepspeed \ 15 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 16 | open_instruct/dpo_tune.py \ 17 | --model_name_or_path allenai/tulu-2-7b \ 18 | --use_flash_attn \ 19 | --gradient_checkpointing \ 20 | --tokenizer_name allenai/tulu-2-7b \ 21 | --use_slow_tokenizer \ 22 | --dataset_name HuggingFaceH4/ultrafeedback_binarized \ 23 | --max_seq_length 2048 \ 24 | --preprocessing_num_workers 16 \ 25 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 26 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 27 | --learning_rate 5e-7 \ 28 | --lr_scheduler_type linear \ 29 | --warmup_ratio 0.1 \ 30 | --weight_decay 0.
\ 31 | --num_train_epochs 3 \ 32 | --output_dir ~/dpo_7b_recreate2 \ 33 | --with_tracking \ 34 | --report_to tensorboard \ 35 | --logging_steps 1 -------------------------------------------------------------------------------- /scripts/dpo_train_with_qlora.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 2 | 3 | NUM_GPUS=8 4 | BATCH_SIZE_PER_GPU=1 5 | TOTAL_BATCH_SIZE=128 6 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 7 | echo "Training model using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 8 | 9 | # Lora training 10 | accelerate launch \ 11 | --num_machines 1 \ 12 | --num_processes $NUM_GPUS \ 13 | open_instruct/dpo_tune.py \ 14 | --model_name_or_path allenai/tulu-2-7b \ 15 | --use_qlora \ 16 | --use_lora \ 17 | --use_flash_attn \ 18 | --lora_rank 64 \ 19 | --lora_alpha 16 \ 20 | --lora_dropout 0.1 \ 21 | --tokenizer_name allenai/tulu-2-7b \ 22 | --use_slow_tokenizer \ 23 | --dataset_name HuggingFaceH4/ultrafeedback_binarized \ 24 | --max_seq_length 1024 \ 25 | --preprocessing_num_workers 128 \ 26 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 27 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 28 | --learning_rate 1e-4 \ 29 | --lr_scheduler_type linear \ 30 | --warmup_ratio 0.03 \ 31 | --weight_decay 0. \ 32 | --num_train_epochs 5 \ 33 | --output_dir output/tulu_v2_dpo_qlora/ \ 34 | --with_tracking \ 35 | --report_to tensorboard \ 36 | --logging_steps 1 && 37 | 38 | python open_instruct/merge_lora.py \ 39 | --base_model_name_or_path allenai/tulu-2-7b \ 40 | --lora_model_name_or_path output/tulu_v2_dpo_qlora/ \ 41 | --output_dir output/tulu_v2_dpo_qlora_merged/ \ 42 | --qlora \ 43 | --save_tokenizer 44 | -------------------------------------------------------------------------------- /scripts/dummy_length_scorer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dummy evaluator that uses a given metric to determine winners in pairwise comparisons. Used to further investigate correlations. 
3 | ''' 4 | import argparse 5 | from transformers import AutoTokenizer 6 | from datasets import load_dataset 7 | import json 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--candidate_file", type=str, help="Candidate file for candidate model outputs.") 11 | parser.add_argument("--metric", default="unique", type=str, help="Metric to use for comparison.") 12 | parser.add_argument("--tokenizer", default="/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B", type=str, help="Tokenizer to use for tokenization.") 13 | args = parser.parse_args() 14 | 15 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=False) 16 | 17 | def count_unique_tokens(text): 18 | return len(set(tokenizer(text).input_ids)) 19 | 20 | def count_token_length(text): 21 | return len(tokenizer(text).input_ids) 22 | 23 | metric_map = { 24 | "unique": count_unique_tokens, 25 | "length": count_token_length, 26 | } 27 | 28 | if __name__ == "__main__": 29 | # load reference data 30 | reference_dataset = load_dataset("hamishivi/alpaca-farm-davinci-003-2048-token") 31 | reference_dataset = [x["output"] for x in reference_dataset["train"]] 32 | # load candidate data 33 | with open(args.candidate_file, "r") as f: 34 | candidate_dataset = json.load(f) 35 | candidate_dataset = [x["output"] for x in candidate_dataset] 36 | win_counter = 0 37 | lose_counter = 0 38 | tie_counter = 0 39 | # compute metrics - we assume same order of reference and candidate data 40 | for reference_sample, candidate_sample in zip(reference_dataset, candidate_dataset): 41 | reference_metric = metric_map[args.metric](reference_sample) 42 | candidate_metric = metric_map[args.metric](candidate_sample) 43 | if reference_metric > candidate_metric: 44 | lose_counter += 1 45 | elif reference_metric < candidate_metric: 46 | win_counter += 1 47 | else: 48 | tie_counter += 1 49 | 50 | print(f"{win_counter}\t{lose_counter}\t{tie_counter}") 51 | -------------------------------------------------------------------------------- /scripts/eval/alpaca_farm.sh: -------------------------------------------------------------------------------- 1 | # Please make sure OPENAI_API_KEY is set in your environment variables 2 | 3 | # Use V1 of alpaca farm evaluation. 4 | export IS_ALPACA_EVAL_2=False 5 | 6 | # use vllm for generation 7 | python -m eval.alpaca_farm.run_eval \ 8 | --model_name_or_path ../checkpoints/tulu_v1_7B/ \ 9 | --tokenizer_name_or_path ../checkpoints/tulu_v1_7B/ \ 10 | --save_dir results/alpaca_farm/tulu_v1_7B/ \ 11 | --eval_batch_size 20 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | 17 | # use normal huggingface generation function 18 | python -m eval.alpaca_farm.run_eval \ 19 | --model_name_or_path ../checkpoints/tulu_v1_7B/ \ 20 | --tokenizer_name_or_path ../checkpoints/tulu_v1_7B/ \ 21 | --save_dir results/alpaca_farm/tulu_v1_7B/ \ 22 | --eval_batch_size 20 \ 23 | --use_chat_format \ 24 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 25 | --load_in_8bit 26 | -------------------------------------------------------------------------------- /scripts/eval/alpaca_farm2.sh: -------------------------------------------------------------------------------- 1 | # Please make sure OPENAI_API_KEY is set in your environment variables 2 | 3 | # Use V1 of alpaca farm evaluation. 
4 | export IS_ALPACA_EVAL_2=False 5 | 6 | # Evaluating LIMA baseline 7b model using vLLM and chat format 7 | python -m eval.alpaca_farm.run_eval \ 8 | --model_name_or_path output//lima_baseline_7B20e/ \ 9 | --tokenizer_name_or_path output//lima_baseline_7B20e/ \ 10 | --save_dir results/alpaca_farm/lima_baseline_7B20e/ \ 11 | --eval_batch_size 20 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # Evaluating LIMA noins 7b model using vLLM and chat format 17 | python -m eval.alpaca_farm.run_eval \ 18 | --model_name_or_path output//lima_noins_7B20e/ \ 19 | --tokenizer_name_or_path output//lima_noins_7B20e/ \ 20 | --save_dir results/alpaca_farm/lima_noins_7B20e/ \ 21 | --eval_batch_size 20 \ 22 | --use_vllm \ 23 | --use_chat_format \ 24 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 25 | -------------------------------------------------------------------------------- /scripts/eval/bbh.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # evaluating baseline 7B model using chain-of-thought and chat format 6 | python -m eval.bbh.run_eval \ 7 | --data_dir data/eval/bbh \ 8 | --save_dir results/bbh/lima_baseline_7B/ \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --max_num_examples_per_task 40 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # evaluating noins 7B model using chain-of-thought and chat format 17 | python -m eval.bbh.run_eval \ 18 | --data_dir data/eval/bbh \ 19 | --save_dir results/bbh/lima_noins_7B/ \ 20 | --model output//lima_noins_7B/ \ 21 | --tokenizer output//lima_noins_7B/ \ 22 | --max_num_examples_per_task 40 \ 23 | --use_vllm \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/eval/bbh2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation.
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # evaluating baseline 7B model using chain-of-thought and chat format 6 | python -m eval.bbh.run_eval \ 7 | --data_dir data/eval/bbh \ 8 | --save_dir results/bbh/lima_baseline_7B/ \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --max_num_examples_per_task 40 \ 12 | --use_vllm \ 13 | --use_chat_format \ 14 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 15 | 16 | # evaluating noins 7B model using chain-of-thought and chat format 17 | python -m eval.bbh.run_eval \ 18 | --data_dir data/eval/bbh \ 19 | --save_dir results/bbh/lima_noins_7B/ \ 20 | --model output//lima_noins_7B/ \ 21 | --tokenizer output//lima_noins_7B/ \ 22 | --max_num_examples_per_task 40 \ 23 | --use_vllm \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/eval/gsm2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating LIMA baseline 7b model using chain-of-thought and chat format 5 | python -m eval.gsm.run_eval \ 6 | --data_dir data/eval/gsm/ \ 7 | --max_num_examples 200 \ 8 | --save_dir results/gsm/lima_baseline_7B-cot-8shot \ 9 | --model output//lima_baseline_7B/ \ 10 | --tokenizer output//lima_baseline_7B/ \ 11 | --n_shot 8 \ 12 | --use_chat_format \ 13 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 14 | --use_vllm 15 | 16 | # Evaluating LIMA noins 7b model using chain-of-thought and chat format 17 | python -m eval.gsm.run_eval \ 18 | --data_dir data/eval/gsm/ \ 19 | --max_num_examples 200 \ 20 | --save_dir results/gsm/lima_noins_7B-cot-8shot \ 21 | --model output//lima_noins_7B/ \ 22 | --tokenizer output//lima_noins_7B/ \ 23 | --n_shot 8 \ 24 | --use_chat_format \ 25 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 26 | --use_vllm 27 | 28 | 29 | ## Evaluating llama2 chat model using chain-of-thought and chat format 30 | #python -m eval.gsm.run_eval \ 31 | # --data_dir data/eval/gsm/ \ 32 | # --max_num_examples 200 \ 33 | # --save_dir results/gsm/llama2-chat-7B-cot-8shot \ 34 | # --model ../hf_llama2_models/7B-chat \ 35 | # --tokenizer ../hf_llama2_models/7B-chat \ 36 | # --n_shot 8 \ 37 | # --use_chat_format \ 38 | # --chat_formatting_function eval.templates.create_prompt_with_llama2_chat_format \ 39 | # --use_vllm 40 | -------------------------------------------------------------------------------- /scripts/eval/ifeval.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating baseline 7B model using chat format 5 | python -m eval.ifeval.run_eval \ 6 | --data_dir data/eval/ifeval/ \ 7 | --save_dir results/ifeval/lima_baseline_7B \ 8 | --model output//lima_baseline_7B/ \ 9 | --tokenizer output//lima_baseline_7B/ \ 10 | --use_chat_format \ 11 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 12 | --use_vllm 13 | 14 | # Evaluating noins 7B model using chat format 15 | python -m eval.ifeval.run_eval \ 16 | --data_dir data/eval/ifeval/ \ 17 | --save_dir results/ifeval/lima_noins_7B \ 18 | --model output//lima_noins_7B/ \ 19 | --tokenizer output//lima_noins_7B/ \ 20 | --use_chat_format \ 21 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 22 | --use_vllm 23 | 24 | -------------------------------------------------------------------------------- /scripts/eval/ifeval2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | # Evaluating baseline 7B model using chat format 5 | python -m eval.ifeval.run_eval \ 6 | --data_dir data/eval/ifeval/ \ 7 | --save_dir results/ifeval/lima_baseline_7B \ 8 | --model output//lima_baseline_7B/ \ 9 | --tokenizer output//lima_baseline_7B/ \ 10 | --use_chat_format \ 11 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 12 | --use_vllm 13 | 14 | # Evaluating noins 7B model using chat format 15 | python -m eval.ifeval.run_eval \ 16 | --data_dir data/eval/ifeval/ \ 17 | --save_dir results/ifeval/lima_noins_7B \ 18 | --model output//lima_noins_7B/ \ 19 | --tokenizer output//lima_noins_7B/ \ 20 | --use_chat_format \ 21 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 22 | --use_vllm 23 | 24 | -------------------------------------------------------------------------------- /scripts/eval/mmlu2.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # Evaluating LIMA baseline 7B model using 0 shot and chat format 6 | python -m eval.mmlu.run_eval \ 7 | --ntrain 0 \ 8 | --data_dir data/eval/mmlu \ 9 | --save_dir results/mmlu/lima_baseline_7B-cot-0shot \ 10 | --model output//lima_baseline_7B/ \ 11 | --tokenizer output//lima_baseline_7B/ \ 12 | --eval_batch_size 4 \ 13 | --load_in_8bit \ 14 | --use_chat_format \ 15 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 16 | 17 | # Evaluating LIMA baseline 7B model using 5 shot and chat format 18 | python -m eval.mmlu.run_eval \ 19 | --ntrain 5 \ 20 | --data_dir data/eval/mmlu \ 21 | --save_dir results/mmlu/lima_baseline_7B-cot-5shot \ 22 | --model output//lima_baseline_7B/ \ 23 | --tokenizer output//lima_baseline_7B/ \ 24 | --eval_batch_size 4 \ 25 | --load_in_8bit \ 26 | --use_chat_format \ 27 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 28 | 29 | 30 | 31 | # Evaluating LIMA noins 7B model using 0 shot and chat format 32 | python -m eval.mmlu.run_eval \ 33 | --ntrain 0 \ 34 | --data_dir data/eval/mmlu \ 35 | --save_dir results/mmlu/lima_noins_7B-cot-0shot \ 36 | --model output//lima_noins_7B/ \ 37 | --tokenizer output//lima_noins_7B/ \ 38 | --eval_batch_size 4 \ 39 | --load_in_8bit \ 40 | --use_chat_format \ 41 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 42 | 43 | # Evaluating LIMA noins 7B model using 5 shot and chat format 44 | python -m eval.mmlu.run_eval \ 45 | --ntrain 5 \ 46 | --data_dir data/eval/mmlu \ 47 | --save_dir results/mmlu/lima_noins_7B-cot-5shot \ 48 | --model output//lima_noins_7B/ \ 49 | --tokenizer output//lima_noins_7B/ \ 50 | --eval_batch_size 4 \ 51 | --load_in_8bit \ 52 | --use_chat_format \ 53 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 54 | -------------------------------------------------------------------------------- /scripts/eval/toxigen.sh: -------------------------------------------------------------------------------- 1 | # example scripts for toxigen 2 | 3 | # evaluate an open-instruct model with chat format 4 | python -m eval.toxigen.run_eval \ 5 | --data_dir data/eval/toxigen/ \ 6 | --save_dir tulu_65b \ 7 | --model_name_or_path tulu_65b/ \ 8 | --use_vllm \ 9 | --use_chat_format \ 10 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format 11 | 12 | 13 | # evaluate a base model without chat format 14 | python -m eval.toxigen.run_eval \ 15 | --data_dir data/eval/toxigen/ \ 16 | --save_dir tulu_65b \ 17 | --model_name_or_path tulu_65b/ \ 18 | --use_vllm 19 | 20 | 21 | # evaluate chatGPT 22 | python -m eval.toxigen.run_eval \ 23 | --data_dir data/eval/toxigen/ \ 24 | --save_dir results/toxigen/chatgpt \ 25 | --openai_engine gpt-3.5-turbo-0301 \ 26 | --max_prompts_per_group 100 \ 27 | --eval_batch_size 20 28 | 29 | 30 | # evaluate gpt4 31 | python -m eval.toxigen.run_eval \ 32 | --data_dir data/eval/toxigen/ \ 33 | --save_dir results/toxigen/gpt4 \ 34 | --openai_engine gpt-4-0314 \ 35 | --max_prompts_per_group 100 \ 36 | --eval_batch_size 20 -------------------------------------------------------------------------------- /scripts/eval/xstest.sh: -------------------------------------------------------------------------------- 1 | # Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation. 
2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | 5 | # Evaluating tulu 7B model using chat format 6 | python -m eval.xstest.run_eval \ 7 | --data_dir data/eval/xstest/ \ 8 | --save_dir results/xstest/tulu-7B-sft \ 9 | --model ../checkpoints/tulu2/7B-sft \ 10 | --tokenizer ../checkpoints/tulu2/7B-sft \ 11 | --use_chat_format \ 12 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 13 | --use_vllm 14 | 15 | 16 | # Evaluating tulu 70B dpo model using chat format 17 | python -m eval.xstest.run_eval \ 18 | --data_dir data/eval/xstest/ \ 19 | --save_dir results/xstest/tulu-70B-dpo \ 20 | --model allenai/tulu-2-dpo-70b \ 21 | --tokenizer allenai/tulu-2-dpo-70b \ 22 | --use_chat_format \ 23 | --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format \ 24 | --use_vllm 25 | 26 | 27 | # Evaluating chatgpt 28 | python -m eval.xstest.run_eval \ 29 | --data_dir data/eval/xstest/ \ 30 | --save_dir results/xstest/chatgpt-no-cot \ 31 | --openai_engine "gpt-3.5-turbo-0125" \ 32 | --eval_batch_size 20 33 | 34 | 35 | # Evaluating gpt4 36 | python -m eval.xstest.run_eval \ 37 | --data_dir data/eval/xstest/ \ 38 | --save_dir results/xstest/gpt4-cot \ 39 | --openai_engine "gpt-4-0613" \ 40 | --eval_batch_size 20 -------------------------------------------------------------------------------- /scripts/finetune_lora_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # Lora training 11 | accelerate launch \ 12 | --mixed_precision bf16 \ 13 | --num_machines 1 \ 14 | --num_processes $NUM_GPUS \ 15 | --use_deepspeed \ 16 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 17 | open_instruct/finetune.py \ 18 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 19 | --use_flash_attn \ 20 | --use_lora \ 21 | --lora_rank 64 \ 22 | --lora_alpha 16 \ 23 | --lora_dropout 0.1 \ 24 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 25 | --use_slow_tokenizer \ 26 | --train_file oasst1_data.jsonl \ 27 | --max_seq_length 4096 \ 28 | --preprocessing_num_workers 16 \ 29 | --checkpointing_steps epoch \ 30 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 31 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 32 | --learning_rate 1e-4 \ 33 | --lr_scheduler_type linear \ 34 | --warmup_ratio 0.03 \ 35 | --weight_decay 0. 
\ 36 | --num_train_epochs 5 \ 37 | --output_dir output/tulu_v2_${MODEL_SIZE}_lora/ \ 38 | --with_tracking \ 39 | --report_to tensorboard \ 40 | --logging_steps 1 && 41 | 42 | python open_instruct/merge_lora.py \ 43 | --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 44 | --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_lora/ \ 45 | --output_dir output/tulu_v2_${MODEL_SIZE}_lora_merged/ \ 46 | --save_tokenizer 47 | -------------------------------------------------------------------------------- /scripts/finetune_qlora_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 2 | 3 | MODEL_SIZE=70B 4 | NUM_GPUS=8 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # Lora training 11 | accelerate launch \ 12 | --num_machines 1 \ 13 | --num_processes $NUM_GPUS \ 14 | open_instruct/finetune.py \ 15 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 16 | --gradient_checkpointing \ 17 | --use_qlora \ 18 | --use_lora \ 19 | --use_flash_attn \ 20 | --lora_rank 64 \ 21 | --lora_alpha 16 \ 22 | --lora_dropout 0.1 \ 23 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 26 | --max_seq_length 4096 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-4 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 5 \ 35 | --output_dir output/tulu_v2_${MODEL_SIZE}_qlora/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 && 39 | 40 | python open_instruct/merge_lora.py \ 41 | --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 42 | --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_qlora/ \ 43 | --output_dir output/tulu_v2_${MODEL_SIZE}_qlora_merged/ \ 44 | --qlora \ 45 | --save_tokenizer 46 | -------------------------------------------------------------------------------- /scripts/finetune_with_accelerate.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 11 | # but it will trade off speed. 
12 | accelerate launch \ 13 | --mixed_precision bf16 \ 14 | --num_machines 1 \ 15 | --num_processes $NUM_GPUS \ 16 | --use_deepspeed \ 17 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 18 | open_instruct/finetune.py \ 19 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 20 | --use_flash_attn \ 21 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 22 | --use_slow_tokenizer \ 23 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 24 | --max_seq_length 8192 \ 25 | --preprocessing_num_workers 128 \ 26 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 27 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 28 | --learning_rate 2e-5 \ 29 | --lr_scheduler_type linear \ 30 | --warmup_ratio 0.03 \ 31 | --weight_decay 0. \ 32 | --num_train_epochs 2 \ 33 | --output_dir output/tulu_v2_${MODEL_SIZE}/ \ 34 | --with_tracking \ 35 | --report_to tensorboard \ 36 | --logging_steps 1 -------------------------------------------------------------------------------- /scripts/finetune_with_hf_trainer.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | MODEL_SIZE=7B 4 | NUM_GPUS=4 5 | BATCH_SIZE_PER_GPU=1 6 | TOTAL_BATCH_SIZE=128 7 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 8 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 9 | 10 | deepspeed --include localhost:0,1,2,3 open_instruct/finetune_trainer.py \ 11 | --deepspeed ds_configs/stage3_no_offloading.conf \ 12 | --model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \ 13 | --tokenizer_name ../hf_llama2_models/${MODEL_SIZE} \ 14 | --use_flash_attn True \ 15 | --use_fast_tokenizer False \ 16 | --train_file data/processed/tulu_v2/tulu_v2_data.jsonl \ 17 | --max_seq_length 8192 \ 18 | --preprocessing_num_workers 64 \ 19 | --do_train \ 20 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 21 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 22 | --learning_rate 2e-5 \ 23 | --lr_scheduler_type linear \ 24 | --warmup_ratio 0.03 \ 25 | --weight_decay 0. \ 26 | --evaluation_strategy "no" \ 27 | --logging_steps 1 \ 28 | --save_strategy epoch \ 29 | --save_total_limit 1 \ 30 | --num_train_epochs 2 \ 31 | --output_dir output/tulu_v2_${MODEL_SIZE}/ \ 32 | --bf16 \ 33 | --tf32 True \ 34 | --torch_dtype bfloat16 \ 35 | --overwrite_output_dir \ 36 | --report_to "tensorboard" -------------------------------------------------------------------------------- /scripts/get_statistics.sh: -------------------------------------------------------------------------------- 1 | # ["super_ni", "cot", "flan_v2", "self_instruct", "unnatural_instructions", "stanford_alpaca", "dolly", "sharegpt", "code_alpaca", "gpt4_alpaca", "baize", "oasst1"] 2 | 3 | # for every dataset, get the statistics 4 | for dataset in super_ni cot flan_v2 self_instruct unnatural_instructions stanford_alpaca dolly sharegpt code_alpaca gpt4_alpaca baize oasst1 lima wizardlm open_orca; do 5 | echo "Getting statistics for $dataset..." 6 | python open_instruct/get_statistics.py --data_path data/processed/${dataset}/${dataset}_data.jsonl --save_path data/processed/${dataset}/${dataset}_statistics.json 7 | done -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 
17 | 18 | DSNAME=lima 19 | epochs=15 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29508 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 17 | 18 | DSNAME=lima 19 | epochs=5 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29510 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/ins/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 17 | 18 | DSNAME=lima 19 | epochs=7 20 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29511 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/lima/lima_data.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 1e-5 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_llama/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 |
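Each epoch-sweep script above writes greedy generations to results/val_eval/<model>/<model>-greedy-long-output.json and scores them against the GPT-3.5 reference (eval/val_eval/val-gpt3.5-2.json) with alpaca_eval. As a quick sanity check on those same files, here is a minimal sketch (not part of the repo) that reuses the idea behind scripts/dummy_length_scorer.py to tabulate a crude length-based win rate across the validation sweep; it assumes both JSON files are lists of records with an "output" field in matching order, the same format dummy_length_scorer.py assumes.

```python
# Illustrative sketch only: a crude character-length comparison across the epoch
# sweep, not the alpaca_eval judgment the scripts above actually use.
import json

REFERENCE = "eval/val_eval/val-gpt3.5-2.json"
EPOCHS = [5, 7, 10, 15, 20]  # matches finetune_{5,7,10,15,20}e.sh

def load_outputs(path):
    # Both files are assumed to be JSON lists of dicts with an "output" field.
    with open(path) as f:
        return [record["output"] for record in json.load(f)]

reference = load_outputs(REFERENCE)

for epochs in EPOCHS:
    model = f"limabaseline7Bep{epochs}"  # ${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs}
    candidate = load_outputs(f"results/val_eval/{model}/{model}-greedy-long-output.json")
    wins = sum(len(c) > len(r) for c, r in zip(candidate, reference))
    ties = sum(len(c) == len(r) for c, r in zip(candidate, reference))
    print(f"{model}\twin={wins}\ttie={ties}\tlose={len(reference) - wins - ties}")
```

For token-based rather than character-based counts, swap len() for the tokenizer-based metrics defined in scripts/dummy_length_scorer.py.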
-------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=lima3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/ins/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_data.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=lima3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/res/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=response 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=lima3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt1_olmo/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Response Tuning 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Baseline (Instruction Tuning) 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 |
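Every finetune_*.sh launcher in these experiment folders pins the effective optimization batch size to 64 sequences and derives the gradient-accumulation count from the GPU count and the per-GPU batch size. Below is a minimal standalone sketch of that arithmetic, using the same variable names and the default values from the scripts (illustrative only, not an additional script in the repository):

NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# effective batch per optimizer step = NUM_GPUS * BATCH_SIZE_PER_GPU * GRADIENT_ACC_STEPS
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE / $NUM_GPUS / $BATCH_SIZE_PER_GPU))
echo "$GRADIENT_ACC_STEPS"  # 32: 2 GPUs x 1 sequence x 32 accumulation steps = 64 sequences per update

Because the expansion uses integer division, TOTAL_BATCH_SIZE should stay a multiple of NUM_GPUS * BATCH_SIZE_PER_GPU; otherwise the realized effective batch size is silently rounded down.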
-------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
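# (Hedged sketch of the memory-saving options named in the comment above; they are not
#  enabled in this run. One would point accelerate at the offloading DeepSpeed config and
#  add gradient checkpointing to the finetune.py flags, roughly:
#    --deepspeed_config_file ds_configs/stage3_offloading_accelerate.conf \
#    ...
#    --gradient_checkpointing \
#  Both reduce GPU memory use but slow each training step.)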
16 | 17 | DSNAME=gsm 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/gsm/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/mbpp/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=10 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=15 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=20 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=pgn 18 | epochs=5 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_llama/pgn/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn 18 | epochs=7 19 | model=${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path meta-llama/Llama-2-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name meta-llama/Llama-2-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 1e-5 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=gsm3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_2e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=2 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/gsm/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=gsm3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/gsm_train/gsm8k.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=mbpp3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=mbpp3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/mbpp/mbpp.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/mbpp/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | MODELNAME=baseline 9 | NUM_GPUS=2 10 | BATCH_SIZE_PER_GPU=1 11 | TOTAL_BATCH_SIZE=64 12 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 13 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 14 | 15 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 16 | # but it will trade off speed. 
17 | 18 | DSNAME=mbpp3e-6 19 | epochs=7 20 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 21 | 22 | accelerate launch \ 23 | --mixed_precision bf16 \ 24 | --num_machines 1 \ 25 | --num_processes $NUM_GPUS \ 26 | --use_deepspeed \ 27 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 28 | --main_process_port 29511 \ 29 | open_instruct/finetune.py \ 30 | --model_name_or_path allenai/OLMo-7B-hf \ 31 | --use_flash_attn \ 32 | --tokenizer_name allenai/OLMo-7B-hf \ 33 | --use_slow_tokenizer \ 34 | --train_file data/processed/mbpp/mbpp.jsonl \ 35 | --max_seq_length 2048 \ 36 | --preprocessing_num_workers 128 \ 37 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 38 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 39 | --learning_rate 3e-6 \ 40 | --lr_scheduler_type linear \ 41 | --warmup_ratio 0.03 \ 42 | --weight_decay 0. \ 43 | --num_train_epochs ${epochs} \ 44 | --output_dir output/${model}/ \ 45 | --with_tracking \ 46 | --report_to tensorboard \ 47 | --logging_steps 1 48 | 49 | 50 | export CUDA_VISIBLE_DEVICES=0 51 | 52 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 53 | 54 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 55 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_15e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=15 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29508 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=pgn3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/pgn/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=pgn3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/pgn/pgn.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_10e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=10 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29507 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_20e.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=20 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29509 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_5e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | 17 | DSNAME=poetry3e-6 18 | epochs=5 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29510 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. \ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/poetry/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 5 | 6 | MODEL_SIZE=7B 7 | MODELNAME=baseline 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | 17 | DSNAME=poetry3e-6 18 | epochs=7 19 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 20 | 21 | accelerate launch \ 22 | --mixed_precision bf16 \ 23 | --num_machines 1 \ 24 | --num_processes $NUM_GPUS \ 25 | --use_deepspeed \ 26 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 27 | --main_process_port 29511 \ 28 | open_instruct/finetune.py \ 29 | --model_name_or_path allenai/OLMo-7B-hf \ 30 | --use_flash_attn \ 31 | --tokenizer_name allenai/OLMo-7B-hf \ 32 | --use_slow_tokenizer \ 33 | --train_file data/processed/poetry/poetry.jsonl \ 34 | --max_seq_length 2048 \ 35 | --preprocessing_num_workers 128 \ 36 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 37 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 38 | --learning_rate 3e-6 \ 39 | --lr_scheduler_type linear \ 40 | --warmup_ratio 0.03 \ 41 | --weight_decay 0. 
\ 42 | --num_train_epochs ${epochs} \ 43 | --output_dir output/${model}/ \ 44 | --with_tracking \ 45 | --report_to tensorboard \ 46 | --logging_steps 1 47 | 48 | 49 | export CUDA_VISIBLE_DEVICES=0 50 | 51 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 52 | 53 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 54 | -------------------------------------------------------------------------------- /scripts/iclr2025/expt2_olmo/recipe/finetune_7e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export CUDA_VISIBLE_DEVICES=0,1 3 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 4 | 5 | MODEL_SIZE=7B 6 | MODELNAME=baseline 7 | NUM_GPUS=2 8 | BATCH_SIZE_PER_GPU=1 9 | TOTAL_BATCH_SIZE=64 10 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 11 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 12 | 13 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 14 | # but it will trade off speed. 15 | 16 | DSNAME=recipe3e-6 17 | epochs=7 18 | model=olmo${DSNAME}${MODELNAME}${MODEL_SIZE}ep${epochs} 19 | 20 | accelerate launch \ 21 | --mixed_precision bf16 \ 22 | --num_machines 1 \ 23 | --num_processes $NUM_GPUS \ 24 | --use_deepspeed \ 25 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 26 | --main_process_port 29511 \ 27 | open_instruct/finetune.py \ 28 | --model_name_or_path allenai/OLMo-7B-hf \ 29 | --use_flash_attn \ 30 | --tokenizer_name allenai/OLMo-7B-hf \ 31 | --use_slow_tokenizer \ 32 | --train_file data/processed/kaggle_food_recipes/kfr.jsonl \ 33 | --max_seq_length 2048 \ 34 | --preprocessing_num_workers 128 \ 35 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 36 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 37 | --learning_rate 3e-6 \ 38 | --lr_scheduler_type linear \ 39 | --warmup_ratio 0.03 \ 40 | --weight_decay 0. 
\ 41 | --num_train_epochs ${epochs} \ 42 | --output_dir output/${model}/ \ 43 | --with_tracking \ 44 | --report_to tensorboard \ 45 | --logging_steps 1 46 | 47 | 48 | export CUDA_VISIBLE_DEVICES=0 49 | 50 | python -m eval.val_eval.run_eval --model_name_or_path output/${model}/ --tokenizer_name_or_path output/${model}/ --save_dir results/val_eval/${model}/ --eval_batch_size 10 --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_vllm 51 | 52 | alpaca_eval --model_outputs results/val_eval/${model}/${model}-greedy-long-output.json --reference_outputs eval/val_eval/val-gpt3.5-2.json 53 | -------------------------------------------------------------------------------- /scripts/iclr2025/no_rephrase_expt/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 | -------------------------------------------------------------------------------- /scripts/iclr2025/other_tags_expt/run_expt_val.sh: -------------------------------------------------------------------------------- 1 | # Validation 2 | ## Baseline (Instruction Tuning) 3 | for f in scripts/iclr2025/expt1/res/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 4 | ## Response Tuning 5 | for f in scripts/iclr2025/expt1/ins/finetune_{5,7,10,15,20}e.sh; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 $f; done 6 | 7 | # Test 8 | # Baseline seeds 9 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/ins/finetune_seed${seed}.sh; done 10 | # Response tuning seeds 11 | for seed in 1 2 3 4 5 ; do sbatch --account nlp --partition sphinx --nodelist sphinx4 --gres gpu:2 --mem 50G --exclude jagupard19,jagupard20,jagupard26,jagupard27,jagupard28,jagupard29,jagupard30,jagupard31 scripts/iclr2025/expt1/res/finetune_seed${seed}.sh; done 12 | 
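Every finetune script collected above derives its gradient accumulation from the same three knobs (TOTAL_BATCH_SIZE, NUM_GPUS, BATCH_SIZE_PER_GPU). A minimal sketch, not a file in this repository, with the shared values plugged in to make the resulting effective batch size explicit:

#!/bin/bash
# Worked instance of the batch-size arithmetic used by the finetune scripts above.
NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# 64 / 2 / 1 = 32 accumulation steps per optimizer update
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
# Effective examples per update: 2 GPUs * 1 per GPU * 32 steps = 64
echo "gradient accumulation steps: $GRADIENT_ACC_STEPS"
echo "effective batch size: $(($NUM_GPUS*$BATCH_SIZE_PER_GPU*$GRADIENT_ACC_STEPS))"

Changing NUM_GPUS or BATCH_SIZE_PER_GPU only shifts work between accumulation steps and parallelism; the per-update effective batch size stays pinned at TOTAL_BATCH_SIZE=64 across all of these runs.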
-------------------------------------------------------------------------------- /scripts/prepare_science_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mix together all datasets to create instruction tuning mix. 3 | """ 4 | 5 | from pathlib import Path 6 | import json 7 | import os 8 | 9 | 10 | def write_jsonl(xs, fname): 11 | with open(fname, "w") as f: 12 | for x in xs: 13 | print(json.dumps(x), file=f) 14 | 15 | 16 | def load_jsonl(fname): 17 | with open(fname) as f: 18 | return [json.loads(line) for line in f] 19 | 20 | 21 | names = [ 22 | "evidence_inference", 23 | "qasper_truncated_4000", 24 | "scifact_json", 25 | "scitldr_aic", 26 | "scierc_ner", 27 | "scierc_relation" 28 | ] 29 | 30 | # This is an instruction dataset about several science tasks that David and some other collaborators created. 31 | # Please contact us if you want to use the raw files 32 | data_dir = Path("../../davidw/proj/science-instruct/promptsource-sciit/prompts_davidw/tasks") 33 | out_dir = Path("data/raw_train/science") 34 | os.makedirs(out_dir, exist_ok=True) 35 | 36 | full_dataset = [] 37 | 38 | for name in names: 39 | ds = load_jsonl(data_dir / f"{name}_train.jsonl") 40 | for entry in ds: 41 | entry["dataset"] = name 42 | full_dataset.append(entry) 43 | 44 | write_jsonl(full_dataset, out_dir / "science_train.jsonl") -------------------------------------------------------------------------------- /scripts/prepare_train_data.sh: -------------------------------------------------------------------------------- 1 | # Downloading same as open-instruct 2 | # check if there is $HF_TOKEN in the environment variables 3 | if [ -z "$HF_TOKEN" ] 4 | then 5 | echo "Warning: HuggingFace dataset LIMA requires permissive access." 6 | echo "Warning: Please request the access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script." 7 | exit 1 8 | fi 9 | 10 | echo "Downloading Stanford alpaca data..." 11 | wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json 12 | 13 | 14 | echo "Downloading LIMA dataset..." 15 | wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl 16 | 17 | echo "Processing datasets..." 18 | python open_instruct/reformat_datasets.py --raw_data_dir data/raw_train/ --output_dir data/processed/ 19 | 20 | # Now download and process datasets specific to this repository. 21 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try2.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 15 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try2/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try3.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 10 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try3/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try4.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 20 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try4/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try5.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 30 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try5/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try6.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 7 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try6/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_no_ins_try7.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_no_instruction.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 5 \ 34 | --output_dir output/lima_noins_${MODEL_SIZE}_try7/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try2.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 15 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try2/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try3.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 10 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try3/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try4.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 20 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try4/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try5.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | open_instruct/finetune.py \ 20 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 21 | --use_flash_attn \ 22 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 23 | --use_slow_tokenizer \ 24 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 25 | --max_seq_length 2048 \ 26 | --preprocessing_num_workers 128 \ 27 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 28 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 29 | --learning_rate 1e-5 \ 30 | --lr_scheduler_type linear \ 31 | --warmup_ratio 0.03 \ 32 | --weight_decay 0. \ 33 | --num_train_epochs 30 \ 34 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try5/ \ 35 | --with_tracking \ 36 | --report_to tensorboard \ 37 | --logging_steps 1 38 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try6.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 
13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | --main_process_port 29506 \ 20 | open_instruct/finetune.py \ 21 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 22 | --use_flash_attn \ 23 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 26 | --max_seq_length 2048 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-5 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 7 \ 35 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try6/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 39 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_noins_plus_partial_try7.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1 2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 3 | 4 | MODEL_SIZE=7B 5 | NUM_GPUS=2 6 | BATCH_SIZE_PER_GPU=1 7 | TOTAL_BATCH_SIZE=64 8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 10 | 11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 12 | # but it will trade off speed. 13 | accelerate launch \ 14 | --mixed_precision bf16 \ 15 | --num_machines 1 \ 16 | --num_processes $NUM_GPUS \ 17 | --use_deepspeed \ 18 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 19 | --main_process_port 29506 \ 20 | open_instruct/finetune.py \ 21 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 22 | --use_flash_attn \ 23 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 24 | --use_slow_tokenizer \ 25 | --train_file data/processed/lima/lima_noins_plus_partial.jsonl \ 26 | --max_seq_length 2048 \ 27 | --preprocessing_num_workers 128 \ 28 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 29 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 30 | --learning_rate 1e-5 \ 31 | --lr_scheduler_type linear \ 32 | --warmup_ratio 0.03 \ 33 | --weight_decay 0. \ 34 | --num_train_epochs 5 \ 35 | --output_dir output/lima_noins_plus_partial${MODEL_SIZE}try7/ \ 36 | --with_tracking \ 37 | --report_to tensorboard \ 38 | --logging_steps 1 39 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | accelerate launch \ 17 | --mixed_precision bf16 \ 18 | --num_machines 1 \ 19 | --num_processes $NUM_GPUS \ 20 | --use_deepspeed \ 21 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 22 | --main_process_port 29509 \ 23 | open_instruct/finetune.py \ 24 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 25 | --use_flash_attn \ 26 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 27 | --use_slow_tokenizer \ 28 | --train_file data/processed/lima/lima_data.jsonl \ 29 | --max_seq_length 2048 \ 30 | --preprocessing_num_workers 128 \ 31 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 32 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 33 | --learning_rate 1e-5 \ 34 | --lr_scheduler_type linear \ 35 | --warmup_ratio 0.03 \ 36 | --weight_decay 0. \ 37 | --num_train_epochs 15 \ 38 | --output_dir output/lima_baseline_${MODEL_SIZE}try2/ \ 39 | --with_tracking \ 40 | --report_to tensorboard \ 41 | --logging_steps 1 42 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 16 | accelerate launch \ 17 | --mixed_precision bf16 \ 18 | --num_machines 1 \ 19 | --num_processes $NUM_GPUS \ 20 | --use_deepspeed \ 21 | --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \ 22 | --main_process_port 29509 \ 23 | open_instruct/finetune.py \ 24 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 25 | --use_flash_attn \ 26 | --tokenizer_name meta-llama/Llama-2-7b-hf \ 27 | --use_slow_tokenizer \ 28 | --train_file data/processed/lima/lima_data.jsonl \ 29 | --max_seq_length 2048 \ 30 | --preprocessing_num_workers 128 \ 31 | --per_device_train_batch_size $BATCH_SIZE_PER_GPU \ 32 | --gradient_accumulation_steps $GRADIENT_ACC_STEPS \ 33 | --learning_rate 1e-5 \ 34 | --lr_scheduler_type linear \ 35 | --warmup_ratio 0.03 \ 36 | --weight_decay 0. \ 37 | --num_train_epochs 10 \ 38 | --output_dir output/lima_baseline_${MODEL_SIZE}try3/ \ 39 | --with_tracking \ 40 | --report_to tensorboard \ 41 | --logging_steps 1 42 | -------------------------------------------------------------------------------- /scripts/sweep/7B/finetune_try4.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | conda activate poi5 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3 6 | 7 | MODEL_SIZE=7B 8 | NUM_GPUS=2 9 | BATCH_SIZE_PER_GPU=1 10 | TOTAL_BATCH_SIZE=64 11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU)) 12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps" 13 | 14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory, 15 | # but it will trade off speed. 
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 20 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try4/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try5.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=0,1
2 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
3 | 
4 | MODEL_SIZE=7B
5 | NUM_GPUS=2
6 | BATCH_SIZE_PER_GPU=1
7 | TOTAL_BATCH_SIZE=64
8 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
9 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
10 | 
11 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
12 | # but it will trade off speed.
13 | accelerate launch \
14 |     --mixed_precision bf16 \
15 |     --num_machines 1 \
16 |     --num_processes $NUM_GPUS \
17 |     --use_deepspeed \
18 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
19 |     --main_process_port 29512 \
20 |     open_instruct/finetune.py \
21 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
22 |     --use_flash_attn \
23 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
24 |     --use_slow_tokenizer \
25 |     --train_file data/processed/lima/lima_data.jsonl \
26 |     --max_seq_length 2048 \
27 |     --preprocessing_num_workers 128 \
28 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
29 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
30 |     --learning_rate 1e-5 \
31 |     --lr_scheduler_type linear \
32 |     --warmup_ratio 0.03 \
33 |     --weight_decay 0. \
34 |     --num_train_epochs 30 \
35 |     --output_dir output/lima_baseline_${MODEL_SIZE}try5/ \
36 |     --with_tracking \
37 |     --report_to tensorboard \
38 |     --logging_steps 1
39 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try6.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | conda activate poi5
4 | export CUDA_VISIBLE_DEVICES=0,1
5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
6 | 
7 | MODEL_SIZE=7B
8 | NUM_GPUS=2
9 | BATCH_SIZE_PER_GPU=1
10 | TOTAL_BATCH_SIZE=64
11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
13 | 
14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
15 | # but it will trade off speed.
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 7 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try6/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/scripts/sweep/7B/finetune_try7.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | conda activate poi5
4 | export CUDA_VISIBLE_DEVICES=0,1
5 | #export CUDA_VISIBLE_DEVICES=0,1,2,3
6 | 
7 | MODEL_SIZE=7B
8 | NUM_GPUS=2
9 | BATCH_SIZE_PER_GPU=1
10 | TOTAL_BATCH_SIZE=64
11 | GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
12 | echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
13 | 
14 | # You can also set --gradient_checkpointing or use `stage3_offloading_accelerate.conf` to save memory,
15 | # but it will trade off speed.
16 | accelerate launch \
17 |     --mixed_precision bf16 \
18 |     --num_machines 1 \
19 |     --num_processes $NUM_GPUS \
20 |     --use_deepspeed \
21 |     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
22 |     --main_process_port 29509 \
23 |     open_instruct/finetune.py \
24 |     --model_name_or_path meta-llama/Llama-2-7b-hf \
25 |     --use_flash_attn \
26 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
27 |     --use_slow_tokenizer \
28 |     --train_file data/processed/lima/lima_data.jsonl \
29 |     --max_seq_length 2048 \
30 |     --preprocessing_num_workers 128 \
31 |     --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
32 |     --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
33 |     --learning_rate 1e-5 \
34 |     --lr_scheduler_type linear \
35 |     --warmup_ratio 0.03 \
36 |     --weight_decay 0. \
37 |     --num_train_epochs 5 \
38 |     --output_dir output/lima_baseline_${MODEL_SIZE}try7/ \
39 |     --with_tracking \
40 |     --report_to tensorboard \
41 |     --logging_steps 1
42 | 
--------------------------------------------------------------------------------
/weight-diff-requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | torch
3 | tqdm
4 | transformers
5 | accelerate
6 | sentencepiece
7 | protobuf==3.20.0
8 | 
--------------------------------------------------------------------------------
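Note on the sweep above: the six finetune_try*.sh baseline scripts are identical except for --num_train_epochs (15, 10, 20, 30, 7, and 5 for try2 through try7) and the try suffix on --output_dir; every run uses an effective batch size of 64 (2 GPUs x 1 example per device x 32 gradient-accumulation steps, since GRADIENT_ACC_STEPS = 64 / 2 / 1 = 32). The following is a minimal sketch of driving the same epoch sweep from one parameterized script. It is a hypothetical helper, not a file in this repository, and it assumes the same environment the per-try scripts assume (the poi5 conda environment already activated, two visible GPUs, and the DeepSpeed config under ds_configs/).

#!/bin/bash
# Hypothetical sweep driver (not part of the repo): runs the LIMA-baseline
# epoch sweep that finetune_try2.sh ... finetune_try7.sh implement as
# separate, near-identical files.

export CUDA_VISIBLE_DEVICES=0,1

MODEL_SIZE=7B
NUM_GPUS=2
BATCH_SIZE_PER_GPU=1
TOTAL_BATCH_SIZE=64
# 64 / 2 / 1 = 32 accumulation steps, matching the per-try scripts.
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))

# try/epoch pairs taken from the individual scripts above.
# The per-try scripts vary --main_process_port (29509/29512); a single
# fixed port is enough when the runs are launched sequentially.
for spec in "try2 15" "try3 10" "try4 20" "try5 30" "try6 7" "try7 5"; do
    read -r TRY EPOCHS <<< "$spec"
    echo "Training llama model ${MODEL_SIZE} for ${EPOCHS} epochs (${TRY})"
    accelerate launch \
        --mixed_precision bf16 \
        --num_machines 1 \
        --num_processes $NUM_GPUS \
        --use_deepspeed \
        --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
        --main_process_port 29509 \
        open_instruct/finetune.py \
        --model_name_or_path meta-llama/Llama-2-7b-hf \
        --use_flash_attn \
        --tokenizer_name meta-llama/Llama-2-7b-hf \
        --use_slow_tokenizer \
        --train_file data/processed/lima/lima_data.jsonl \
        --max_seq_length 2048 \
        --preprocessing_num_workers 128 \
        --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
        --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
        --learning_rate 1e-5 \
        --lr_scheduler_type linear \
        --warmup_ratio 0.03 \
        --weight_decay 0. \
        --num_train_epochs $EPOCHS \
        --output_dir output/lima_baseline_${MODEL_SIZE}${TRY}/ \
        --with_tracking \
        --report_to tensorboard \
        --logging_steps 1
done

Keeping one script per trial, as the repository does, trades duplication for the ability to launch, edit, or rerun any single trial independently; the loop above is only a compact way to read what the sweep varies.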