├── figs
│   ├── overview.png
│   └── magpie_logo.png
├── requirements.txt
├── LICENSE
├── recipes
│   ├── Llama-3-8B-Magpie-Align-v0.1
│   │   └── Llama-3-8B-Magpie-Align-v0.1.yaml
│   ├── Llama-3.1-8B-Magpie-Align-v0.1
│   │   └── Llama-3.1-8B-Magpie-Align-v0.1.yaml
│   ├── Llama-3-8B-Magpie-Align-v0.2
│   │   └── Llama-3-8B-Magpie-Align-v0.2.yaml
│   ├── Llama-3-8B-Magpie-Align-v0.3
│   │   └── Llama-3-8B-Magpie-Align-v0.3.yaml
│   ├── Llama-3.1-8B-Magpie-Align-v0.2
│   │   └── Llama-3.1-8B-Magpie-Align-v0.2.yaml
│   ├── Llama-3-8B-Magpie-Align-SFT-v0.1
│   │   └── Llama-3-8B-Magpie-Align-SFT-v0.1.yaml
│   ├── Llama-3-8B-Magpie-Pro-SFT-300K-v0.1
│   │   └── Llama-3-8B-Magpie-Pro-SFT-300K-v0.1.yaml
│   ├── Llama-3-8B-Magpie-Align-SFT-v0.2
│   │   └── Llama-3-8B-Magpie-Align-SFT-v0.2.yaml
│   ├── Llama-3.1-8B-Magpie-Align-SFT-v0.1
│   │   └── Llama-3.1-8B-Magpie-Align-SFT-v0.1.yaml
│   ├── Llama-3.1-8B-Magpie-Align-SFT-v0.2
│   │   └── Llama-3.1-8B-Magpie-Align-SFT-v0.2.yaml
│   ├── Llama-3-8B-Magpie-Align-SFT-v0.3
│   │   └── Llama-3-8B-Magpie-Align-SFT-v0.3.yaml
│   └── README.md
├── scripts
│   ├── magpie-multi-turn.sh
│   ├── magpie_example_po.sh
│   ├── magpie-gemma7b.sh
│   ├── magpie-vicuna-7b.sh
│   ├── magpie-yi34b.sh
│   ├── magpie.sh
│   ├── magpie-llama2-7b.sh
│   ├── magpie-llama2-70b.sh
│   ├── magpie-llama3-8b.sh
│   ├── magpie-mistral7b.sh
│   ├── magpie-llama3-70b.sh
│   ├── magpie_code.sh
│   ├── magpie_math.sh
│   ├── magpie-qwen2.5-14b.sh
│   ├── magpie-qwen2.5-3b.sh
│   ├── magpie-qwen2.5-7b.sh
│   ├── magpie-phi3mini.sh
│   ├── magpie-phi3small.sh
│   ├── magpie-qwen2.5-32b.sh
│   ├── magpie-qwen2.5-72b.sh
│   ├── magpie-phi3medium.sh
│   ├── magpie-qwen2-7b.sh
│   ├── magpie-qwen2.5-32b-coder.sh
│   ├── magpie-deepseek-coderv2-lite.sh
│   ├── magpie-qwen2-72b.sh
│   ├── magpie-qwen2-math-7b.sh
│   ├── magpie_translation.sh
│   ├── magpie-qwen2.5-math-72b.sh
│   ├── magpie-llama3.3-70b.sh
│   ├── magpie-gemma2-9b.sh
│   ├── magpie-gemma2-27b.sh
│   ├── magpie-llama3.1-8b.sh
│   ├── magpie-llama3.1-70b.sh
│   └── unitag.sh
├── data_sft
│   └── data_filter.ipynb
├── exp
│   ├── utils.py
│   ├── gen_po_rewards.py
│   ├── gen_dis.py
│   ├── gen_mt.py
│   ├── gen_res.py
│   ├── gen_po_multi_res.py
│   └── gen_ins.py
├── .gitignore
├── data_po
│   └── example_instructions.jsonl
├── demo.ipynb
├── navigation.md
└── README.md

/figs/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magpie-align/magpie/HEAD/figs/overview.png
--------------------------------------------------------------------------------
/figs/magpie_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magpie-align/magpie/HEAD/figs/magpie_logo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://flashinfer.ai/whl/cu124/torch2.4/
2 | # LLM
3 | # flash-attn[--no-build-isolation]
4 | vllm==0.6.5
5 | transformers
6 | sentence-transformers
7 | sentencepiece
8 | trl
9 | peft
10 | datasets
11 | accelerate
12 | bitsandbytes
13 | autoawq
14 | ray
15 | flashinfer
16 | 
17 | # Cloud API & Chat
18 | git+https://github.com/lm-sys/FastChat.git
19 | tenacity
20 | boto3
21 | openai
22 | google-generativeai
23 | anthropic
24 | together
25 | 
26 | # Widgets & Visualization
27 | matplotlib
28 | wandb
29 | ipywidgets
30 | ipykernel
31 | 
32 | # Utils
33 | lingua-language-detector
34 | faiss-gpu
35 | ml_collections
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | 
Copyright (c) 2024 magpie-align 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-v0.1/Llama-3-8B-Magpie-Align-v0.1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1 3 | torch_dtype: null 4 | 5 | # Data training arguments 6 | # For definitions, see: src/h4/training/config.py 7 | dataset_mixer: 8 | princeton-nlp/llama3-ultrafeedback: 1.0 9 | dataset_splits: 10 | - train 11 | - test 12 | preprocessing_num_workers: 12 13 | 14 | # DPOTrainer arguments 15 | bf16: true 16 | beta: 0.01 17 | do_eval: true 18 | evaluation_strategy: steps 19 | eval_steps: 100 20 | gradient_accumulation_steps: 16 21 | gradient_checkpointing: true 22 | gradient_checkpointing_kwargs: 23 | use_reentrant: False 24 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-v0.1 25 | learning_rate: 1.0e-6 26 | log_level: info 27 | logging_steps: 1 28 | lr_scheduler_type: cosine 29 | max_length: 2048 30 | max_prompt_length: 1800 31 | num_train_epochs: 1 32 | optim: adamw_torch 33 | output_dir: data/magpie-pro-mt-ultradpo-1e-6 34 | per_device_train_batch_size: 2 35 | per_device_eval_batch_size: 4 36 | push_to_hub: true 37 | save_strategy: "steps" 38 | save_steps: 100 39 | save_total_limit: 1 40 | seed: 42 41 | warmup_ratio: 0.1 42 | -------------------------------------------------------------------------------- /recipes/Llama-3.1-8B-Magpie-Align-v0.1/Llama-3.1-8B-Magpie-Align-v0.1.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1 2 | hub_model_id: Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1 3 | output_dir: alignment_handbook_out/Llama-3.1-8B-Magpie-Align-v0.1 4 | run_name: Llama-3.1-8B-Magpie-Align-v0.1 5 | 6 | dataset_mixer: 7 | princeton-nlp/llama3-ultrafeedback-armorm: 1.0 8 | dataset_splits: 9 | - train 10 | - test 11 | preprocessing_num_workers: 64 12 | 13 | # DPOTrainer arguments 14 | bf16: true 15 | beta: 0.01 16 | learning_rate: 1.0e-6 17 | gradient_accumulation_steps: 16 18 | per_device_train_batch_size: 2 19 | per_device_eval_batch_size: 4 20 | num_train_epochs: 1 21 | max_length: 2048 22 | max_prompt_length: 1800 23 | warmup_ratio: 0.1 24 | logging_steps: 1 25 | lr_scheduler_type: cosine 26 | optim: adamw_torch 27 | 28 | torch_dtype: 
null 29 | use_flash_attention_2: true 30 | do_eval: true 31 | evaluation_strategy: steps 32 | eval_steps: 100 33 | gradient_checkpointing: true 34 | gradient_checkpointing_kwargs: 35 | use_reentrant: False 36 | log_level: info 37 | push_to_hub: true 38 | save_strategy: "steps" 39 | save_steps: 100 40 | save_total_limit: 1 41 | seed: 42 42 | report_to: 43 | - wandb 44 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-v0.2/Llama-3-8B-Magpie-Align-v0.2.yaml: -------------------------------------------------------------------------------- 1 | # Customized Configs 2 | model_name_or_path: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.2 3 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-v0.2 4 | output_dir: alignment_handbook_out/Llama-3-8B-Magpie-Align-v0.2 5 | run_name: Llama-3-8B-Magpie-Align-v0.2 6 | 7 | dataset_mixer: 8 | princeton-nlp/llama3-ultrafeedback-armorm: 1.0 9 | dataset_splits: 10 | - train 11 | - test 12 | preprocessing_num_workers: 24 13 | 14 | # DPOTrainer arguments 15 | bf16: true 16 | beta: 0.01 17 | learning_rate: 0.8e-6 18 | gradient_accumulation_steps: 8 19 | per_device_train_batch_size: 2 20 | per_device_eval_batch_size: 4 21 | num_train_epochs: 1 22 | max_length: 2048 23 | max_prompt_length: 1800 24 | warmup_ratio: 0.1 25 | logging_steps: 1 26 | lr_scheduler_type: cosine 27 | optim: adamw_torch 28 | 29 | torch_dtype: null 30 | use_flash_attention_2: true 31 | do_eval: true 32 | evaluation_strategy: steps 33 | eval_steps: 100 34 | gradient_checkpointing: true 35 | gradient_checkpointing_kwargs: 36 | use_reentrant: False 37 | log_level: info 38 | push_to_hub: true 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | report_to: 44 | - wandb 45 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-v0.3/Llama-3-8B-Magpie-Align-v0.3.yaml: -------------------------------------------------------------------------------- 1 | # Customized Configs 2 | model_name_or_path: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3 3 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-v0.3 4 | output_dir: alignment_handbook_out/Llama-3-8B-Magpie-Align-v0.3 5 | run_name: Llama-3-8B-Magpie-Align-v0.3 6 | 7 | dataset_mixer: 8 | princeton-nlp/llama3-ultrafeedback-armorm: 1.0 9 | dataset_splits: 10 | - train 11 | - test 12 | preprocessing_num_workers: 24 13 | 14 | # DPOTrainer arguments 15 | bf16: true 16 | beta: 0.01 17 | learning_rate: 0.7e-6 18 | gradient_accumulation_steps: 8 19 | per_device_train_batch_size: 2 20 | per_device_eval_batch_size: 4 21 | num_train_epochs: 1 22 | max_length: 2048 23 | max_prompt_length: 1800 24 | warmup_ratio: 0.1 25 | logging_steps: 1 26 | lr_scheduler_type: cosine 27 | optim: adamw_torch 28 | 29 | torch_dtype: null 30 | use_flash_attention_2: true 31 | do_eval: true 32 | evaluation_strategy: steps 33 | eval_steps: 100 34 | gradient_checkpointing: true 35 | gradient_checkpointing_kwargs: 36 | use_reentrant: False 37 | log_level: info 38 | push_to_hub: true 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | report_to: 44 | - wandb 45 | -------------------------------------------------------------------------------- /recipes/Llama-3.1-8B-Magpie-Align-v0.2/Llama-3.1-8B-Magpie-Align-v0.2.yaml: -------------------------------------------------------------------------------- 1 | # Customized Configs 2 | model_name_or_path: 
Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.2 3 | hub_model_id: Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.2 4 | output_dir: /data/zhangchen_xu/alignment_handbook_out/Llama-3.1-8B-Magpie-Align-v0.2 5 | run_name: Llama-3.1-8B-Magpie-Align-v0.2 6 | 7 | dataset_mixer: 8 | Magpie-Align/Llama-3.1-70B-PO-100K-armorm: 1.0 9 | dataset_splits: 10 | - train 11 | - test 12 | preprocessing_num_workers: 64 13 | 14 | # DPOTrainer arguments 15 | bf16: true 16 | beta: 0.01 17 | learning_rate: 0.5e-6 18 | gradient_accumulation_steps: 16 19 | per_device_train_batch_size: 2 20 | per_device_eval_batch_size: 4 21 | num_train_epochs: 1 22 | max_length: 2048 23 | max_prompt_length: 1800 24 | warmup_ratio: 0.1 25 | logging_steps: 1 26 | lr_scheduler_type: cosine 27 | optim: adamw_torch 28 | 29 | torch_dtype: null 30 | use_flash_attention_2: true 31 | do_eval: true 32 | evaluation_strategy: steps 33 | eval_steps: 100 34 | gradient_checkpointing: true 35 | gradient_checkpointing_kwargs: 36 | use_reentrant: False 37 | log_level: info 38 | push_to_hub: true 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | report_to: 44 | - wandb -------------------------------------------------------------------------------- /scripts/magpie-multi-turn.sh: -------------------------------------------------------------------------------- 1 | input_file=${1:-"none"} 2 | num_turns=${2:-2} 3 | device=${3:-"0"} 4 | model_path=${4:-"meta-llama/Meta-Llama-3-8B-Instruct"} 5 | tensor_parallel=1 6 | gpu_memory_utilization=0.95 7 | batch_size=128 8 | 9 | if [ $input_file == "none" ]; then 10 | echo "[magpie.sh] Input file not provided!" 11 | exit 1 12 | fi 13 | if [ ! -f $input_file ]; then 14 | echo "[magpie.sh] Input file not found!" 15 | exit 1 16 | fi 17 | 18 | # get job path from input file 19 | job_path=$(dirname "$input_file") 20 | exec > >(tee -a "$job_path/tagging.log") 2>&1 21 | echo "[magpie.sh] Job Path: $job_path" 22 | echo "[magpie.sh] Input File: $input_file" 23 | echo "[magpie.sh] Num Turns: $num_turns" 24 | echo "[magpie.sh] Model Name: $model_path" 25 | echo "[magpie.sh] System Config: device=$device, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 26 | 27 | echo "[magpie.sh] Start Generating Multi-turn Conversations..." 28 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_mt.py \ 29 | --device $device \ 30 | --model_path $model_path \ 31 | --input_file $input_file \ 32 | --num_turns $num_turns \ 33 | --tensor_parallel $tensor_parallel \ 34 | --gpu_memory_utilization $gpu_memory_utilization \ 35 | --batch_size $batch_size \ 36 | 37 | echo "[magpie.sh] Finish Generating Multi-turn Conversations!" 
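The multi-turn script above takes, as its first positional argument, a JSON file of single-turn data produced earlier in the pipeline (e.g., by magpie.sh), followed by the number of turns, the CUDA device, and the model path. A minimal invocation sketch, run from the scripts directory; the input path below is a hypothetical placeholder, not a file shipped with the repo:

    bash magpie-multi-turn.sh ../data/<job_name>/<single_turn_data>.json 3 0 meta-llama/Meta-Llama-3-8B-Instruct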
-------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-SFT-v0.1/Llama-3-8B-Magpie-Align-SFT-v0.1.yaml: -------------------------------------------------------------------------------- 1 | base_model: meta-llama/Meta-Llama-3-8B 2 | model_type: LlamaForCausalLM 3 | tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Pro-MT-300K-v0.1 12 | type: sharegpt 13 | conversation: llama3 14 | dataset_prepared_path: last_run_prepared 15 | val_set_size: 0.001 16 | output_dir: ./Llama-3-8B-Magpie-Align-SFT-v0.1 17 | 18 | sequence_len: 8192 19 | sample_packing: true 20 | eval_sample_packing: false 21 | pad_to_sequence_len: true 22 | 23 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1 24 | 25 | gradient_accumulation_steps: 8 26 | micro_batch_size: 1 27 | num_epochs: 2 28 | optimizer: paged_adamw_8bit 29 | lr_scheduler: cosine 30 | learning_rate: 2e-5 31 | 32 | train_on_inputs: false 33 | group_by_length: false 34 | bf16: auto 35 | fp16: 36 | tf32: false 37 | 38 | gradient_checkpointing: true 39 | gradient_checkpointing_kwargs: 40 | use_reentrant: false 41 | early_stopping_patience: 42 | resume_from_checkpoint: 43 | logging_steps: 1 44 | xformers_attention: 45 | flash_attention: true 46 | 47 | warmup_steps: 100 48 | evals_per_epoch: 5 49 | eval_table_size: 50 | saves_per_epoch: 1 51 | debug: 52 | deepspeed: 53 | weight_decay: 0.0 54 | fsdp: 55 | fsdp_config: 56 | special_tokens: 57 | pad_token: <|end_of_text|> 58 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Pro-SFT-300K-v0.1/Llama-3-8B-Magpie-Pro-SFT-300K-v0.1.yaml: -------------------------------------------------------------------------------- 1 | base_model: meta-llama/Meta-Llama-3-8B 2 | model_type: LlamaForCausalLM 3 | tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Pro-300K-Filtered 12 | type: sharegpt 13 | conversation: llama3 14 | dataset_prepared_path: last_run_prepared 15 | val_set_size: 0.001 16 | output_dir: ./out_Llama-3-8B-Magpie-Pro-300K-v0.1 17 | 18 | sequence_len: 8192 19 | sample_packing: true 20 | eval_sample_packing: false 21 | pad_to_sequence_len: true 22 | 23 | wandb_project: 24 | wandb_entity: 25 | wandb_watch: 26 | wandb_name: 27 | wandb_log_model: 28 | hub_model_id: 29 | 30 | gradient_accumulation_steps: 8 31 | micro_batch_size: 1 32 | num_epochs: 2 33 | optimizer: paged_adamw_8bit 34 | lr_scheduler: cosine 35 | learning_rate: 2e-5 36 | 37 | train_on_inputs: false 38 | group_by_length: false 39 | bf16: auto 40 | fp16: 41 | tf32: false 42 | 43 | gradient_checkpointing: true 44 | gradient_checkpointing_kwargs: 45 | use_reentrant: false 46 | early_stopping_patience: 47 | resume_from_checkpoint: 48 | logging_steps: 1 49 | xformers_attention: 50 | flash_attention: true 51 | 52 | warmup_steps: 100 53 | evals_per_epoch: 1 54 | eval_table_size: 55 | saves_per_epoch: 3 56 | debug: 57 | deepspeed: 58 | weight_decay: 0.0 59 | fsdp: 60 | fsdp_config: 61 | special_tokens: 62 | pad_token: <|end_of_text|> 63 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-SFT-v0.2/Llama-3-8B-Magpie-Align-SFT-v0.2.yaml: -------------------------------------------------------------------------------- 1 | 
base_model: meta-llama/Meta-Llama-3-8B 2 | model_type: LlamaForCausalLM 3 | tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Reasoning-150K 12 | type: sharegpt 13 | conversation: llama3 14 | - path: Magpie-Align/Magpie-Pro-MT-300K-v0.1 15 | type: sharegpt 16 | conversation: llama3 17 | dataset_prepared_path: last_run_prepared 18 | val_set_size: 0.001 19 | output_dir: axolotl_out/Llama-3-8B-Magpie-Align-SFT-v0.2 20 | 21 | sequence_len: 8192 22 | sample_packing: true 23 | eval_sample_packing: false 24 | pad_to_sequence_len: true 25 | 26 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.2 27 | 28 | gradient_accumulation_steps: 32 29 | micro_batch_size: 1 30 | num_epochs: 2 31 | optimizer: paged_adamw_8bit 32 | lr_scheduler: cosine 33 | learning_rate: 2e-5 34 | 35 | train_on_inputs: false 36 | group_by_length: false 37 | bf16: auto 38 | fp16: 39 | tf32: false 40 | 41 | gradient_checkpointing: true 42 | gradient_checkpointing_kwargs: 43 | use_reentrant: false 44 | early_stopping_patience: 45 | resume_from_checkpoint: 46 | logging_steps: 1 47 | xformers_attention: 48 | flash_attention: true 49 | 50 | warmup_ratio: 0.1 51 | evals_per_epoch: 5 52 | eval_table_size: 53 | saves_per_epoch: 1 54 | debug: 55 | deepspeed: 56 | weight_decay: 0.0 57 | fsdp: 58 | fsdp_config: 59 | special_tokens: 60 | pad_token: <|end_of_text|> 61 | -------------------------------------------------------------------------------- /recipes/Llama-3.1-8B-Magpie-Align-SFT-v0.1/Llama-3.1-8B-Magpie-Align-SFT-v0.1.yaml: -------------------------------------------------------------------------------- 1 | base_model: meta-llama/Meta-Llama-3.1-8B 2 | model_type: LlamaForCausalLM 3 | tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Reasoning-150K 12 | type: sharegpt 13 | conversation: llama3 14 | - path: Magpie-Align/Magpie-Pro-MT-300K-v0.1 15 | type: sharegpt 16 | conversation: llama3 17 | dataset_prepared_path: last_run_prepared 18 | val_set_size: 0.001 19 | output_dir: axolotl_out/Llama-3.1-8B-Magpie-Align-SFT-v0.1 20 | 21 | sequence_len: 8192 22 | sample_packing: true 23 | eval_sample_packing: false 24 | pad_to_sequence_len: true 25 | 26 | hub_model_id: Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1 27 | 28 | gradient_accumulation_steps: 16 29 | micro_batch_size: 1 30 | num_epochs: 2 31 | optimizer: paged_adamw_8bit 32 | lr_scheduler: cosine 33 | learning_rate: 2e-5 34 | 35 | train_on_inputs: false 36 | group_by_length: false 37 | bf16: auto 38 | fp16: 39 | tf32: false 40 | 41 | gradient_checkpointing: true 42 | gradient_checkpointing_kwargs: 43 | use_reentrant: false 44 | early_stopping_patience: 45 | resume_from_checkpoint: 46 | logging_steps: 1 47 | xformers_attention: 48 | flash_attention: true 49 | 50 | warmup_ratio: 0.1 51 | evals_per_epoch: 5 52 | eval_table_size: 53 | saves_per_epoch: 1 54 | debug: 55 | deepspeed: 56 | weight_decay: 0.0 57 | fsdp: 58 | fsdp_config: 59 | special_tokens: 60 | pad_token: <|end_of_text|> 61 | -------------------------------------------------------------------------------- /recipes/Llama-3.1-8B-Magpie-Align-SFT-v0.2/Llama-3.1-8B-Magpie-Align-SFT-v0.2.yaml: -------------------------------------------------------------------------------- 1 | base_model: meta-llama/Meta-Llama-3.1-8B 2 | model_type: LlamaForCausalLM 3 | 
tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Reasoning-150K 12 | type: sharegpt 13 | conversation: llama3 14 | - path: Magpie-Align/Magpie-Llama-3.1-Pro-500K-Filtered 15 | type: sharegpt 16 | conversation: llama3 17 | dataset_prepared_path: last_run_prepared 18 | val_set_size: 0.001 19 | output_dir: saves/Llama-3.1-8B-Magpie-Align-SFT-v0.2 20 | 21 | sequence_len: 8192 22 | sample_packing: true 23 | eval_sample_packing: false 24 | pad_to_sequence_len: true 25 | 26 | hub_model_id: Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.2 27 | 28 | gradient_accumulation_steps: 32 29 | micro_batch_size: 1 30 | num_epochs: 2 31 | optimizer: paged_adamw_8bit 32 | lr_scheduler: cosine 33 | learning_rate: 2e-5 34 | 35 | train_on_inputs: false 36 | group_by_length: false 37 | bf16: auto 38 | fp16: 39 | tf32: false 40 | 41 | gradient_checkpointing: true 42 | gradient_checkpointing_kwargs: 43 | use_reentrant: false 44 | early_stopping_patience: 45 | resume_from_checkpoint: 46 | logging_steps: 1 47 | xformers_attention: 48 | flash_attention: true 49 | 50 | warmup_ratio: 0.1 51 | evals_per_epoch: 5 52 | eval_table_size: 53 | saves_per_epoch: 1 54 | debug: 55 | deepspeed: 56 | weight_decay: 0.0 57 | fsdp: 58 | fsdp_config: 59 | special_tokens: 60 | pad_token: <|end_of_text|> 61 | -------------------------------------------------------------------------------- /recipes/Llama-3-8B-Magpie-Align-SFT-v0.3/Llama-3-8B-Magpie-Align-SFT-v0.3.yaml: -------------------------------------------------------------------------------- 1 | base_model: meta-llama/Meta-Llama-3-8B 2 | model_type: LlamaForCausalLM 3 | tokenizer_type: AutoTokenizer 4 | chat_template: llama3 5 | 6 | load_in_8bit: false 7 | load_in_4bit: false 8 | strict: false 9 | 10 | datasets: 11 | - path: Magpie-Align/Magpie-Reasoning-150K 12 | type: sharegpt 13 | conversation: llama3 14 | - path: Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese 15 | type: sharegpt 16 | conversation: llama3 17 | - path: Magpie-Align/Magpie-Pro-MT-300K-v0.1 18 | type: sharegpt 19 | conversation: llama3 20 | dataset_prepared_path: last_run_prepared 21 | val_set_size: 0.001 22 | output_dir: axolotl_out/Llama-3-8B-Magpie-Align-SFT-v0.3 23 | 24 | sequence_len: 8192 25 | sample_packing: true 26 | eval_sample_packing: false 27 | pad_to_sequence_len: true 28 | 29 | hub_model_id: Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3 30 | 31 | gradient_accumulation_steps: 32 32 | micro_batch_size: 1 33 | num_epochs: 2 34 | optimizer: paged_adamw_8bit 35 | lr_scheduler: cosine 36 | learning_rate: 2e-5 37 | 38 | train_on_inputs: false 39 | group_by_length: false 40 | bf16: auto 41 | fp16: 42 | tf32: false 43 | 44 | gradient_checkpointing: true 45 | gradient_checkpointing_kwargs: 46 | use_reentrant: false 47 | early_stopping_patience: 48 | resume_from_checkpoint: 49 | logging_steps: 1 50 | xformers_attention: 51 | flash_attention: true 52 | 53 | warmup_ratio: 0.1 54 | evals_per_epoch: 5 55 | eval_table_size: 56 | saves_per_epoch: 1 57 | debug: 58 | deepspeed: 59 | weight_decay: 0.0 60 | fsdp: 61 | fsdp_config: 62 | special_tokens: 63 | pad_token: <|end_of_text|> 64 | -------------------------------------------------------------------------------- /scripts/magpie_example_po.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | input_file=${2:-"../data_po/example_instructions.jsonl"} 3 | 
num_samples=${3:-5} 4 | res_topp=${4:-1} 5 | res_temp=${5:-0.8} 6 | res_rep=1 7 | device="0" 8 | tensor_parallel=1 9 | gpu_memory_utilization=0.95 10 | n=200 11 | batch_size=200 12 | 13 | # Get Current Time 14 | timestamp=$(date +%s) 15 | 16 | # Generate Pretty Name 17 | job_name="${input_file##*/}_topp${res_topp}_temp${res_temp}_PO" 18 | 19 | ### Setup Logging 20 | log_dir="data_po" 21 | if [ ! -d "../${log_dir}" ]; then 22 | mkdir -p "../${log_dir}" 23 | fi 24 | job_path="../${log_dir}" 25 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 26 | echo "[magpie.sh] Model Name: $model_path" 27 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 28 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 29 | echo "[magpie.sh] Timestamp: $timestamp" 30 | echo "[magpie.sh] Job Name: $job_name" 31 | 32 | echo "[magpie.sh] Start Generating Responses..." 33 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_po_multi_res.py \ 34 | --device $device \ 35 | --input_file $input_file \ 36 | --model_path $model_path \ 37 | --num_samples $num_samples \ 38 | --batch_size $batch_size \ 39 | --top_p $res_topp \ 40 | --temperature $res_temp \ 41 | --repetition_penalty $res_rep \ 42 | --tensor_parallel $tensor_parallel \ 43 | --gpu_memory_utilization $gpu_memory_utilization \ 44 | --offline 45 | 46 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_po_rewards.py \ 47 | --device $device \ 48 | --input_file "${input_file%.jsonl}_${num_samples}res.json" \ 49 | 50 | echo "[magpie.sh] Finish Generating Responses!" -------------------------------------------------------------------------------- /scripts/magpie-gemma7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"google/gemma-1.1-7b-it"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!"
53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie-vicuna-7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"lmsys/vicuna-7b-v1.5"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 
68 | -------------------------------------------------------------------------------- /scripts/magpie-yi34b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"01-ai/Yi-1.5-34B-Chat"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie-llama2-7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Llama-2-7b-chat-hf"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 
40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie-llama2-70b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Llama-2-70b-chat-hf"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 
68 | -------------------------------------------------------------------------------- /scripts/magpie-llama3-8b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie-mistral7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"mistralai/Mistral-7B-Instruct-v0.3"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie-llama3-70b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-70B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 
40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline 66 | 67 | echo "[magpie.sh] Finish Generating Responses!" 68 | -------------------------------------------------------------------------------- /scripts/magpie_code.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --control_tasks code \ 49 | --n $n \ 50 | --job_name $job_name \ 51 | --timestamp $timestamp 52 | 53 | echo "[magpie.sh] Finish Generating Instructions!" 54 | 55 | echo "[magpie.sh] Start Generating Responses..." 56 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 57 | --device $device \ 58 | --model_path $model_path \ 59 | --batch_size $batch_size \ 60 | --top_p $res_topp \ 61 | --temperature $res_temp \ 62 | --repetition_penalty $res_rep \ 63 | --tensor_parallel $tensor_parallel \ 64 | --gpu_memory_utilization $gpu_memory_utilization \ 65 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 
69 | -------------------------------------------------------------------------------- /scripts/magpie_math.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --control_tasks math \ 49 | --n $n \ 50 | --job_name $job_name \ 51 | --timestamp $timestamp 52 | 53 | echo "[magpie.sh] Finish Generating Instructions!" 54 | 55 | echo "[magpie.sh] Start Generating Responses..." 56 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 57 | --device $device \ 58 | --model_path $model_path \ 59 | --batch_size $batch_size \ 60 | --top_p $res_topp \ 61 | --temperature $res_temp \ 62 | --repetition_penalty $res_rep \ 63 | --tensor_parallel $tensor_parallel \ 64 | --gpu_memory_utilization $gpu_memory_utilization \ 65 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-14b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-14B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.7} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1" 9 | tensor_parallel=2 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-3b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-3B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.7} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="1" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 
40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-7B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.7} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 
55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-phi3mini.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"microsoft/Phi-3-mini-128k-instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline \ 66 | --use_tokenizer_template 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 
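For reference, these magpie-*.sh generation scripts share the same positional interface — $1 model path, $2 total prompts, $3/$4 instruction top_p/temperature, $5/$6 response top_p/temperature — so a run can be launched from the `scripts/` directory as in the hypothetical example below (the prompt count is illustrative, not a value used by the project):

```bash
# Hypothetical invocation, run from the scripts/ directory.
# Arguments: model path, total prompts, ins_top_p, ins_temp, res_top_p, res_temp.
cd scripts
bash magpie-phi3mini.sh "microsoft/Phi-3-mini-128k-instruct" 100000 1 1 1 0
```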
69 | -------------------------------------------------------------------------------- /scripts/magpie-phi3small.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"microsoft/Phi-3-small-128k-instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline \ 66 | --use_tokenizer_template 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-32b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-32B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.6} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-72b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-72B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.6} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 
40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-phi3medium.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"microsoft/Phi-3-medium-128k-instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 
55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline \ 66 | --use_tokenizer_template 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2-7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2-7B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | 8 | # Constants 9 | res_rep=1 10 | device="0" 11 | tensor_parallel=1 12 | gpu_memory_utilization=0.95 13 | n=200 14 | batch_size=200 15 | 16 | # Get Current Time 17 | timestamp=$(date +%s) 18 | 19 | # Generate Pretty Name 20 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 21 | 22 | ### Setup Logging 23 | log_dir="data" 24 | if [ ! -d "../${log_dir}" ]; then 25 | mkdir -p "../${log_dir}" 26 | fi 27 | 28 | job_path="../${log_dir}/${job_name}" 29 | 30 | mkdir -p $job_path 31 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 32 | echo "[magpie.sh] Model Name: $model_path" 33 | echo "[magpie.sh] Pretty name: $job_name" 34 | echo "[magpie.sh] Total Prompts: $total_prompts" 35 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 36 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 37 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 38 | echo "[magpie.sh] Timestamp: $timestamp" 39 | echo "[magpie.sh] Job Name: $job_name" 40 | 41 | echo "[magpie.sh] Start Generating Instructions..." 42 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 43 | --device $device \ 44 | --model_path $model_path \ 45 | --total_prompts $total_prompts \ 46 | --top_p $ins_topp \ 47 | --temperature $ins_temp \ 48 | --tensor_parallel $tensor_parallel \ 49 | --gpu_memory_utilization $gpu_memory_utilization \ 50 | --n $n \ 51 | --job_name $job_name \ 52 | --timestamp $timestamp 53 | 54 | echo "[magpie.sh] Finish Generating Instructions!" 55 | 56 | echo "[magpie.sh] Start Generating Responses..." 57 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 58 | --device $device \ 59 | --model_path $model_path \ 60 | --batch_size $batch_size \ 61 | --top_p $res_topp \ 62 | --temperature $res_temp \ 63 | --repetition_penalty $res_rep \ 64 | --tensor_parallel $tensor_parallel \ 65 | --gpu_memory_utilization $gpu_memory_utilization \ 66 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 67 | --use_tokenizer_template \ 68 | --offline 69 | 70 | echo "[magpie.sh] Finish Generating Responses!" 
71 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-32b-coder.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-Coder-32B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --use_tokenizer_template \ 66 | --offline 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-deepseek-coderv2-lite.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp 51 | 52 | echo "[magpie.sh] Finish Generating Instructions!" 53 | 54 | echo "[magpie.sh] Start Generating Responses..." 55 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 56 | --device $device \ 57 | --model_path $model_path \ 58 | --batch_size $batch_size \ 59 | --top_p $res_topp \ 60 | --temperature $res_temp \ 61 | --repetition_penalty $res_rep \ 62 | --tensor_parallel $tensor_parallel \ 63 | --gpu_memory_utilization $gpu_memory_utilization \ 64 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 65 | --offline \ 66 | --use_tokenizer_template 67 | 68 | echo "[magpie.sh] Finish Generating Responses!" 69 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2-72b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2-72B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | 8 | # Constants 9 | res_rep=1 10 | device="0,1,2,3" 11 | tensor_parallel=4 12 | gpu_memory_utilization=0.95 13 | n=200 14 | batch_size=200 15 | 16 | # Get Current Time 17 | timestamp=$(date +%s) 18 | 19 | # Generate Pretty Name 20 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 21 | 22 | ### Setup Logging 23 | log_dir="data" 24 | if [ ! -d "../${log_dir}" ]; then 25 | mkdir -p "../${log_dir}" 26 | fi 27 | 28 | job_path="../${log_dir}/${job_name}" 29 | 30 | mkdir -p $job_path 31 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 32 | echo "[magpie.sh] Model Name: $model_path" 33 | echo "[magpie.sh] Pretty name: $job_name" 34 | echo "[magpie.sh] Total Prompts: $total_prompts" 35 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 36 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 37 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 38 | echo "[magpie.sh] Timestamp: $timestamp" 39 | echo "[magpie.sh] Job Name: $job_name" 40 | 41 | echo "[magpie.sh] Start Generating Instructions..." 
42 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 43 | --device $device \ 44 | --model_path $model_path \ 45 | --total_prompts $total_prompts \ 46 | --top_p $ins_topp \ 47 | --temperature $ins_temp \ 48 | --tensor_parallel $tensor_parallel \ 49 | --gpu_memory_utilization $gpu_memory_utilization \ 50 | --n $n \ 51 | --job_name $job_name \ 52 | --timestamp $timestamp 53 | 54 | echo "[magpie.sh] Finish Generating Instructions!" 55 | 56 | echo "[magpie.sh] Start Generating Responses..." 57 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 58 | --device $device \ 59 | --model_path $model_path \ 60 | --batch_size $batch_size \ 61 | --top_p $res_topp \ 62 | --temperature $res_temp \ 63 | --repetition_penalty $res_rep \ 64 | --tensor_parallel $tensor_parallel \ 65 | --gpu_memory_utilization $gpu_memory_utilization \ 66 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 67 | --use_tokenizer_template \ 68 | --offline 69 | 70 | echo "[magpie.sh] Finish Generating Responses!" 71 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2-math-7b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2-Math-7B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | 8 | # Constants 9 | res_rep=1 10 | device="0" 11 | tensor_parallel=1 12 | gpu_memory_utilization=0.95 13 | n=200 14 | batch_size=200 15 | 16 | # Get Current Time 17 | timestamp=$(date +%s) 18 | 19 | # Generate Pretty Name 20 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 21 | 22 | ### Setup Logging 23 | log_dir="data" 24 | if [ ! -d "../${log_dir}" ]; then 25 | mkdir -p "../${log_dir}" 26 | fi 27 | 28 | job_path="../${log_dir}/${job_name}" 29 | 30 | mkdir -p $job_path 31 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 32 | echo "[magpie.sh] Model Name: $model_path" 33 | echo "[magpie.sh] Pretty name: $job_name" 34 | echo "[magpie.sh] Total Prompts: $total_prompts" 35 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 36 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 37 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 38 | echo "[magpie.sh] Timestamp: $timestamp" 39 | echo "[magpie.sh] Job Name: $job_name" 40 | 41 | echo "[magpie.sh] Start Generating Instructions..." 42 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 43 | --device $device \ 44 | --model_path $model_path \ 45 | --total_prompts $total_prompts \ 46 | --top_p $ins_topp \ 47 | --temperature $ins_temp \ 48 | --tensor_parallel $tensor_parallel \ 49 | --gpu_memory_utilization $gpu_memory_utilization \ 50 | --n $n \ 51 | --job_name $job_name \ 52 | --timestamp $timestamp 53 | 54 | echo "[magpie.sh] Finish Generating Instructions!" 55 | 56 | echo "[magpie.sh] Start Generating Responses..." 
57 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 58 | --device $device \ 59 | --model_path $model_path \ 60 | --batch_size $batch_size \ 61 | --top_p $res_topp \ 62 | --temperature $res_temp \ 63 | --repetition_penalty $res_rep \ 64 | --tensor_parallel $tensor_parallel \ 65 | --gpu_memory_utilization $gpu_memory_utilization \ 66 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 67 | --use_tokenizer_template \ 68 | --offline 69 | 70 | echo "[magpie.sh] Finish Generating Responses!" 71 | -------------------------------------------------------------------------------- /scripts/magpie_translation.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --control_tasks translation \ 49 | --n $n \ 50 | --job_name $job_name \ 51 | --timestamp $timestamp \ 52 | --disable_early_stopping 53 | 54 | echo "[magpie.sh] Finish Generating Instructions!" 55 | 56 | echo "[magpie.sh] Start Generating Responses..." 57 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 58 | --device $device \ 59 | --model_path $model_path \ 60 | --batch_size $batch_size \ 61 | --top_p $res_topp \ 62 | --temperature $res_temp \ 63 | --repetition_penalty $res_rep \ 64 | --tensor_parallel $tensor_parallel \ 65 | --gpu_memory_utilization $gpu_memory_utilization \ 66 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 67 | --offline 68 | 69 | echo "[magpie.sh] Finish Generating Responses!" 
70 | -------------------------------------------------------------------------------- /scripts/magpie-qwen2.5-math-72b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"Qwen/Qwen2.5-Math-72B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-0.9} 4 | ins_temp=${4:-0.6} 5 | res_topp=${5:-0.9} 6 | res_temp=${6:-0.6} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --flaming_tokens \ 49 | --n $n \ 50 | --job_name $job_name \ 51 | --timestamp $timestamp 52 | 53 | echo "[magpie.sh] Finish Generating Instructions!" 54 | 55 | echo "[magpie.sh] Start Generating Responses..." 56 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 57 | --device $device \ 58 | --model_path $model_path \ 59 | --batch_size $batch_size \ 60 | --top_p $res_topp \ 61 | --temperature $res_temp \ 62 | --repetition_penalty $res_rep \ 63 | --tensor_parallel $tensor_parallel \ 64 | --gpu_memory_utilization $gpu_memory_utilization \ 65 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 66 | --use_tokenizer_template \ 67 | --offline 68 | 69 | echo "[magpie.sh] Finish Generating Responses!" 70 | -------------------------------------------------------------------------------- /scripts/magpie-llama3.3-70b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Llama-3.3-70B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! 
-d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --disable_early_stopping \ 49 | --sanitize \ 50 | --n $n \ 51 | --job_name $job_name \ 52 | --timestamp $timestamp \ 53 | --max_tokens 1024 54 | 55 | echo "[magpie.sh] Finish Generating Instructions!" 56 | 57 | echo "[magpie.sh] Start Generating Responses..." 58 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 59 | --device $device \ 60 | --model_path $model_path \ 61 | --batch_size $batch_size \ 62 | --top_p $res_topp \ 63 | --temperature $res_temp \ 64 | --repetition_penalty $res_rep \ 65 | --tensor_parallel $tensor_parallel \ 66 | --gpu_memory_utilization $gpu_memory_utilization \ 67 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 68 | --offline 69 | 70 | echo "[magpie.sh] Finish Generating Responses!" 71 | -------------------------------------------------------------------------------- /scripts/magpie-gemma2-9b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"google/gemma-2-9b-it"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=10 12 | batch_size=2 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 
40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp \ 51 | --disable_early_stopping \ 52 | --sanitize \ 53 | --engine hf \ 54 | --max_tokens 512 55 | 56 | echo "[magpie.sh] Finish Generating Instructions!" 57 | 58 | echo "[magpie.sh] Start Generating Responses..." 59 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 60 | --device $device \ 61 | --model_path $model_path \ 62 | --batch_size $batch_size \ 63 | --top_p $res_topp \ 64 | --temperature $res_temp \ 65 | --repetition_penalty $res_rep \ 66 | --tensor_parallel $tensor_parallel \ 67 | --gpu_memory_utilization $gpu_memory_utilization \ 68 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 69 | --engine hf \ 70 | --max_tokens 2048 71 | 72 | echo "[magpie.sh] Finish Generating Responses!" 73 | -------------------------------------------------------------------------------- /scripts/magpie-gemma2-27b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"google/gemma-2-27b-it"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-1} 5 | res_topp=${5:-1} 6 | res_temp=${6:-0} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=10 12 | batch_size=2 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --n $n \ 49 | --job_name $job_name \ 50 | --timestamp $timestamp \ 51 | --disable_early_stopping \ 52 | --sanitize \ 53 | --engine hf \ 54 | --max_tokens 512 55 | 56 | echo "[magpie.sh] Finish Generating Instructions!" 57 | 58 | echo "[magpie.sh] Start Generating Responses..." 
59 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 60 | --device $device \ 61 | --model_path $model_path \ 62 | --batch_size $batch_size \ 63 | --top_p $res_topp \ 64 | --temperature $res_temp \ 65 | --repetition_penalty $res_rep \ 66 | --tensor_parallel $tensor_parallel \ 67 | --gpu_memory_utilization $gpu_memory_utilization \ 68 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 69 | --engine hf \ 70 | --max_tokens 2048 71 | 72 | echo "[magpie.sh] Finish Generating Responses!" 73 | -------------------------------------------------------------------------------- /scripts/magpie-llama3.1-8b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3.1-8B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.8} 5 | res_topp=${5:-0.9} 6 | res_temp=${6:-0.6} 7 | res_rep=1 8 | device="0" 9 | tensor_parallel=1 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --disable_early_stopping \ 49 | --sanitize \ 50 | --logits_processor \ 51 | --n $n \ 52 | --job_name $job_name \ 53 | --timestamp $timestamp \ 54 | --max_tokens 1024 55 | 56 | echo "[magpie.sh] Finish Generating Instructions!" 57 | 58 | echo "[magpie.sh] Start Generating Responses..." 59 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 60 | --device $device \ 61 | --model_path $model_path \ 62 | --batch_size $batch_size \ 63 | --top_p $res_topp \ 64 | --temperature $res_temp \ 65 | --repetition_penalty $res_rep \ 66 | --tensor_parallel $tensor_parallel \ 67 | --gpu_memory_utilization $gpu_memory_utilization \ 68 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 69 | --offline 70 | 71 | echo "[magpie.sh] Finish Generating Responses!" 
72 | -------------------------------------------------------------------------------- /scripts/magpie-llama3.1-70b.sh: -------------------------------------------------------------------------------- 1 | model_path=${1:-"meta-llama/Meta-Llama-3.1-70B-Instruct"} 2 | total_prompts=${2:-1000} 3 | ins_topp=${3:-1} 4 | ins_temp=${4:-0.9} 5 | res_topp=${5:-0.9} 6 | res_temp=${6:-0.6} 7 | res_rep=1 8 | device="0,1,2,3" 9 | tensor_parallel=4 10 | gpu_memory_utilization=0.95 11 | n=200 12 | batch_size=200 13 | 14 | # Get Current Time 15 | timestamp=$(date +%s) 16 | 17 | # Generate Pretty Name 18 | job_name="${model_path##*/}_topp${ins_topp}_temp${ins_temp}_${timestamp}" 19 | 20 | ### Setup Logging 21 | log_dir="data" 22 | if [ ! -d "../${log_dir}" ]; then 23 | mkdir -p "../${log_dir}" 24 | fi 25 | 26 | job_path="../${log_dir}/${job_name}" 27 | 28 | mkdir -p $job_path 29 | exec > >(tee -a "$job_path/${job_name}.log") 2>&1 30 | echo "[magpie.sh] Model Name: $model_path" 31 | echo "[magpie.sh] Pretty name: $job_name" 32 | echo "[magpie.sh] Total Prompts: $total_prompts" 33 | echo "[magpie.sh] Instruction Generation Config: temp=$ins_temp, top_p=$ins_topp" 34 | echo "[magpie.sh] Response Generation Config: temp=$res_temp, top_p=$res_topp, rep=$res_rep" 35 | echo "[magpie.sh] System Config: device=$device, n=$n, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 36 | echo "[magpie.sh] Timestamp: $timestamp" 37 | echo "[magpie.sh] Job Name: $job_name" 38 | 39 | echo "[magpie.sh] Start Generating Instructions..." 40 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_ins.py \ 41 | --device $device \ 42 | --model_path $model_path \ 43 | --total_prompts $total_prompts \ 44 | --top_p $ins_topp \ 45 | --temperature $ins_temp \ 46 | --tensor_parallel $tensor_parallel \ 47 | --gpu_memory_utilization $gpu_memory_utilization \ 48 | --disable_early_stopping \ 49 | --sanitize \ 50 | --logits_processor \ 51 | --n $n \ 52 | --job_name $job_name \ 53 | --timestamp $timestamp \ 54 | --max_tokens 1024 55 | 56 | echo "[magpie.sh] Finish Generating Instructions!" 57 | 58 | echo "[magpie.sh] Start Generating Responses..." 59 | CUDA_VISIBLE_DEVICES=$device python ../exp/gen_res.py \ 60 | --device $device \ 61 | --model_path $model_path \ 62 | --batch_size $batch_size \ 63 | --top_p $res_topp \ 64 | --temperature $res_temp \ 65 | --repetition_penalty $res_rep \ 66 | --tensor_parallel $tensor_parallel \ 67 | --gpu_memory_utilization $gpu_memory_utilization \ 68 | --input_file $job_path/Magpie_${model_path##*/}_${total_prompts}_${timestamp}_ins.json \ 69 | --offline 70 | 71 | echo "[magpie.sh] Finish Generating Responses!" 
72 | -------------------------------------------------------------------------------- /data_sft/data_filter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from datasets import load_dataset\n", 10 | "\n", 11 | "dataset_path = \"Qwen2-Magpie-Pro-1M-v0.1.jsonl\" # Put your path to the dataset here\n", 12 | "dataset = load_dataset(\"json\", data_files=dataset_path)\n", 13 | "print(dataset)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def high_quality_filter(example):\n", 23 | " return (\n", 24 | " example['input_quality'] in ['good', 'excellent']\n", 25 | " and example['instruct_reward'] > -10\n", 26 | " and not example['instruction'].endswith(':')\n", 27 | " and (\n", 28 | " example['min_similar_conversation_id'] is None\n", 29 | " or example['conversation_id'] == example['min_similar_conversation_id']\n", 30 | " )\n", 31 | " )\n", 32 | "filtered_dataset = dataset['train'].filter(high_quality_filter)\n", 33 | "print(filtered_dataset)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "def get_output_length(example):\n", 43 | " return len(example['response'])\n", 44 | "\n", 45 | "filtered_dataset = filtered_dataset.map(lambda x: {'response_length': get_output_length(x)})\n", 46 | "# sort the dataset by response_length\n", 47 | "sorted_dataset = filtered_dataset.sort('response_length', reverse=True)\n", 48 | "# select the top 300000 examples\n", 49 | "top_x_dataset = sorted_dataset.select(range(300000))\n", 50 | "# Shuffle the dataset\n", 51 | "top_x_dataset = top_x_dataset.shuffle(seed=42)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Submit the dataset to the hub\n", 61 | "top_x_dataset.push_to_hub(\"Magpie-Align/Magpie-Qwen-Pro-300K-Filtered\", private=True)" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "magpie", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.10.14" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 2 86 | } 87 | -------------------------------------------------------------------------------- /recipes/README.md: -------------------------------------------------------------------------------- 1 | # Scripts to train Magpie models 2 | 3 | ## Requirements 4 | ### Hardware 5 | - 4 GPU @ 80GB VRAM 6 | 7 | If you have fewer GPUs, please add `gradient_accumulation_steps` in the config file accordingly. 8 | 9 | If you are using GPUs with less VRAM, please consider using `deepspeed zero3` or `FSDP`. 10 | 11 | ### Software 12 | 13 | We use [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) for supervised fine-tuning, and [alignment-handbook](https://github.com/huggingface/alignment-handbook) for DPO. 14 | 15 | ## Supervised Fine-tuning 16 | 17 | ### Install Axolotl 18 | Environment: Python >=3.10 and Pytorch >=2.1.1. You can setup the environment using Conda. 
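A minimal sketch of that Conda setup (the environment name and the exact PyTorch build below are illustrative assumptions, not pinned project requirements):

```bash
# Illustrative environment setup; pick the torch build matching your CUDA version.
conda create -n axolotl-env python=3.10 -y
conda activate axolotl-env
pip install "torch>=2.1.1"
```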
19 | 20 | **Note**: We found that the latest Axolotl may encounter some bugs during training, so we use [this commit](https://github.com/OpenAccess-AI-Collective/axolotl/commit/7c2bf3091f5e73c787afe839dfdcc8220b770a1a) in our experiments. Please also manually install FastChat for the latest Llama3 conversation template support. 21 | 22 | ```bash 23 | git clone https://github.com/lm-sys/FastChat 24 | cd FastChat 25 | pip install -e . 26 | cd ../ 27 | 28 | git clone https://github.com/OpenAccess-AI-Collective/axolotl 29 | cd axolotl 30 | git reset --hard 7c2bf3091f5e73c787afe839dfdcc8220b770a1a 31 | 32 | pip3 install packaging ninja 33 | pip3 install -e '.[flash-attn,deepspeed]' 34 | ``` 35 | 36 | **Note 2:** This issue has since been fixed in Axolotl, so you can now use the latest version instead of the pinned commit. 37 | 38 | ### Run 39 | Go to the directory where you placed the SFT YAML file, then run the following command: 40 | ``` 41 | accelerate launch -m axolotl.cli.train your_config_name.yaml 42 | ``` 43 | 44 | ## DPO 45 | 46 | ### Install Alignment Handbook 47 | 48 | ```bash 49 | conda create -n handbook python=3.10 && conda activate handbook 50 | 51 | git clone https://github.com/huggingface/alignment-handbook.git 52 | cd alignment-handbook 53 | python -m pip install . 54 | 55 | python -m pip install flash-attn --no-build-isolation 56 | ``` 57 | 58 | ### Run 59 | 60 | Please change `num_processes` if you are not using 4 GPUs. 61 | 62 | ``` 63 | ACCELERATE_LOG_LEVEL=info accelerate launch --num_processes 4 --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py your_config_name.yaml 64 | ``` 65 | 66 | ## Magpie Recipes 67 | 68 | - [Llama-3-8B-Magpie-Align-SFT-v0.1](Llama-3-8B-Magpie-Align-SFT-v0.1) 69 | - [Llama-3-8B-Magpie-Align-SFT-v0.2](Llama-3-8B-Magpie-Align-SFT-v0.2) 70 | - [Llama-3-8B-Magpie-Align-SFT-v0.3](Llama-3-8B-Magpie-Align-SFT-v0.3) 71 | - [Llama-3-8B-Magpie-Align-v0.1](Llama-3-8B-Magpie-Align-v0.1) 72 | - [Llama-3-8B-Magpie-Align-v0.2](Llama-3-8B-Magpie-Align-v0.2) 73 | - [Llama-3-8B-Magpie-Align-v0.3](Llama-3-8B-Magpie-Align-v0.3) 74 | - [Llama-3.1-8B-Magpie-Align-SFT-v0.1](Llama-3.1-8B-Magpie-Align-SFT-v0.1) 75 | - [Llama-3.1-8B-Magpie-Align-SFT-v0.2](Llama-3.1-8B-Magpie-Align-SFT-v0.2) 76 | - [Llama-3.1-8B-Magpie-Align-v0.1](Llama-3.1-8B-Magpie-Align-v0.1) 77 | - [Llama-3.1-8B-Magpie-Align-v0.2](Llama-3.1-8B-Magpie-Align-v0.2) -------------------------------------------------------------------------------- /exp/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import uuid 4 | from time import sleep 5 | from fastchat.model import get_conversation_template 6 | 7 | # File I/O utilities 8 | def load_jsonl_to_list(jsonl_file_path): 9 | data_list = [] 10 | with open(jsonl_file_path, 'r') as file: 11 | for line in file: 12 | json_obj = json.loads(line) 13 | data_list.append(json_obj) 14 | return data_list 15 | 16 | # Load dataset 17 | def load_dataset_from_file(filename): 18 | #if the file is json 19 | if filename.endswith('.json'): 20 | with open(filename, 'r') as file: 21 | return json.load(file) 22 | elif filename.endswith('.jsonl'): 23 | return load_jsonl_to_list(filename) 24 | else: 25 | raise ValueError("Invalid file format. 
Please provide a .json or .jsonl file.") 26 | 27 | # Save dataset 28 | def save_dataset(data, filename, convert_to_jsonl=False): 29 | if convert_to_jsonl: 30 | with open(filename, 'w') as file: 31 | for obj in data: 32 | file.write(json.dumps(obj) + '\n') 33 | else: 34 | with open(filename, 'w') as file: 35 | json.dump(data, file, indent=2) 36 | 37 | # API utilities 38 | 39 | # Function to make a single API request with exponential back-off 40 | def make_api_request_with_retry(message, api_params, api_endpoint, api_headers, max_retries=5): 41 | payload = api_params.copy() 42 | payload['messages'] = message 43 | 44 | for attempt in range(max_retries): 45 | try: 46 | response = requests.post(api_endpoint, json=payload, headers=api_headers) 47 | response.raise_for_status() # Raises an HTTPError for bad responses 48 | return response.json()['choices'][0]['message']['content'] 49 | except requests.RequestException as e: 50 | print(f"Attempt {attempt + 1} failed: {str(e)}") 51 | sleep(2 ** attempt) # Exponential back-off 52 | 53 | print("All retry attempts failed.") 54 | return None 55 | 56 | 57 | # Template utilities 58 | def apply_template(model_name): 59 | if "llama-3" in model_name.lower(): 60 | conv = get_conversation_template("llama-3") 61 | elif "llama3" in model_name.lower(): 62 | conv = get_conversation_template("llama-3") 63 | elif "gemma" in model_name.lower(): 64 | conv = get_conversation_template("gemma") 65 | elif "qwen" in model_name.lower(): 66 | conv = get_conversation_template("qwen-7b-chat") 67 | elif "zephyr" in model_name.lower(): 68 | conv = get_conversation_template("zephyr") 69 | elif "llama-2" in model_name.lower(): 70 | conv = get_conversation_template("llama-2") 71 | elif "tulu" in model_name.lower(): 72 | conv = get_conversation_template("tulu") 73 | elif "mixtral" in model_name.lower() or "mistral" in model_name.lower(): 74 | conv = get_conversation_template("mistral") 75 | elif "yi" in model_name.lower() and "chat" in model_name.lower(): 76 | conv = get_conversation_template("Yi-34b-chat") 77 | elif "vicuna" in model_name.lower(): 78 | conv = get_conversation_template("vicuna_v1.1") 79 | else: 80 | raise ValueError(f"ERROR: model_name {model_name} not supported for applying templates!") 81 | 82 | return conv 83 | 84 | 85 | # UUID 86 | def generate_uuid(name): 87 | namespace = uuid.NAMESPACE_DNS 88 | return str(uuid.uuid5(namespace, name)) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | data/obtain_labels_from_exisiting_files.ipynb 164 | data/final_dataset_process.ipynb 165 | Magpie Data/ 166 | data_po_private/ 167 | data_po_bon/ 168 | data_gemma/ 169 | data_llama3.1/ 170 | data_qwen2.5/ -------------------------------------------------------------------------------- /exp/gen_po_rewards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import requests 5 | import concurrent.futures 6 | from time import sleep 7 | from tqdm import tqdm 8 | import argparse 9 | import torch 10 | from typing import Dict, List 11 | from utils import load_dataset_from_file, save_dataset 12 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 13 | 14 | class ArmoRMPipeline: 15 | def __init__(self, model_id, device_map="auto", torch_dtype=torch.bfloat16, truncation=True, trust_remote_code=False, max_length=4096): 16 | self.model = AutoModelForSequenceClassification.from_pretrained( 17 | model_id, 18 | device_map=device_map, 19 | trust_remote_code=trust_remote_code, 20 | torch_dtype=torch_dtype, 21 | ) 22 | self.tokenizer = AutoTokenizer.from_pretrained( 23 | model_id, 24 | use_fast=True, 25 | ) 26 | self.truncation = truncation 27 | self.device = self.model.device 28 | self.max_length = max_length 29 | 30 | def __call__(self, messages: List[Dict[str, str]]) -> Dict[str, float]: 31 | """ 32 | messages: OpenAI chat messages to be scored 33 | Note: no batching since due to length differences, the model will have to pad to the max length which is not efficient 34 | Returns: a dictionary with the score between 0 and 1 35 | """ 36 | input_ids = self.tokenizer.apply_chat_template( 37 | messages, 38 | return_tensors="pt", 39 | padding=True, 40 | truncation=self.truncation, 41 | max_length=self.max_length, 42 | ).to(self.device) 43 | with torch.no_grad(): 44 | output = self.model(input_ids) 45 | score = output.score.float().item() 46 | return {"score": score} 47 | 48 | ################ 49 | # Configurations 50 | ################ 51 | def get_args(): 52 | # Experiment Settings 53 | parser = argparse.ArgumentParser(description="Tagging Manager.") 54 | parser.add_argument("--input_file", type=str, default=None, 55 | help="Input dataset file name") 56 | parser.add_argument("--batch_size", type=int, default=1, help="Number of samples per batch.") 57 | parser.add_argument("--checkpoint_every", type=int, default=5000, help="Save checkpoint every n batches") 58 | 59 | # Generation Configs 60 | parser.add_argument("--device", type=int, default=0) 61 | parser.add_argument("--max_tokens", type=int, default=1024) 62 | parser.add_argument("--top_k", type=int, default=1) # 1 means greedy decoding 63 | parser.add_argument("--repetition_penalty", type=float, default=1.0) 64 | parser.add_argument("--stop_tokens", type=str, default="<|eot_id|>", help="Stop token") 65 | 66 | return parser.parse_args() 67 | args = get_args() 68 | 69 | checkpoint_every = args.checkpoint_every 70 | batch_size = args.batch_size 71 | 72 | # Generate outputs, update dataset in batches, and overwrite checkpoint 73 | def generate_and_update(dataset, model, checkpoint_file, checkpoint_every = 20): 74 | if os.path.exists(checkpoint_file): 75 | last_checkpoint_idx = len(load_dataset_from_file(checkpoint_file)) 76 | print(f"Checkpoint file found. 
Resuming from last checkpoint with index {last_checkpoint_idx}.") 77 | dataset[:last_checkpoint_idx] = load_dataset_from_file(checkpoint_file) 78 | num_batches = (len(dataset) - last_checkpoint_idx + batch_size - 1) // batch_size 79 | print(f"Remaining number of batches: {num_batches}") 80 | else: 81 | last_checkpoint_idx = 0 82 | num_batches = (len(dataset) + batch_size - 1) // batch_size # Calculate total number of batches 83 | print(f"Total number of batches: {num_batches}") 84 | 85 | for i in tqdm(range(num_batches)): 86 | start_idx = i * batch_size + last_checkpoint_idx 87 | end_idx = min((i + 1) * batch_size + last_checkpoint_idx, len(dataset)) 88 | batch = dataset[start_idx:end_idx] 89 | 90 | for item in batch: 91 | rewards = [] 92 | instruction = item['instruction'] 93 | for response in item['responses']: 94 | reward_armorm = model([{"role": "user", "content": instruction}, {"role": "assistant", "content": response}]) 95 | rewards.append(reward_armorm) 96 | 97 | item['rewards_armorm'] = rewards 98 | 99 | dataset[start_idx:end_idx] = batch 100 | # Overwrite the same checkpoint file every checkpoint_every batches 101 | 102 | if (i + 1) % checkpoint_every == 0: 103 | save_dataset(dataset[:end_idx], checkpoint_file) 104 | print(f"Dataset checkpoint saved after batch {i + 1}.") 105 | i += 1 106 | 107 | return dataset 108 | 109 | # main 110 | input_file = args.input_file 111 | output_file = f"{input_file[:input_file.rfind('.')]}_armorm.json" 112 | checkpoint_file = f"{input_file[:input_file.rfind('.')]}_armorm_checkpoint.json" 113 | dataset = load_dataset_from_file(input_file) 114 | 115 | model = ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True, device_map=f"cuda:{args.device}") 116 | 117 | updated_dataset = generate_and_update(dataset, model, checkpoint_file, checkpoint_every) 118 | 119 | save_dataset(updated_dataset, output_file, convert_to_jsonl=False) 120 | 121 | # Remove the checkpoint file after completion 122 | if os.path.exists(checkpoint_file): 123 | os.remove(checkpoint_file) 124 | print("Final dataset saved. Checkpoint removed.") -------------------------------------------------------------------------------- /data_po/example_instructions.jsonl: -------------------------------------------------------------------------------- 1 | {"uuid": "8cabba98-e7fa-52f7-a778-ab847f36f123", "instruction": "I have code which is sending data to an outgoing web service using POST request. The data is being stored on a RESTful API. However, I want to store this data locally as well, so that even if the web service goes down, I can still use the data locally. This is a real-time application where the data wont be queued and shall be stored directly.", "gen_input_configs": {"temperature": 1.1, "top_p": 1.0}, "intent": "The user wants to store data locally in addition to sending it to a web service, ensuring data availability even if the web service goes down.", "knowledge": "To solve this problem, the models need to know about data storage, web services, and real-time data processing.", "difficulty": "medium", "input_quality": "good", "quality_explanation": "The user query is clear and specific about the problem they are trying to solve, which is storing data locally in addition to sending it to a web service. The query also provides some context about the application being real-time and not queuing data. 
However, the query could be improved by providing more details about the specific issues they are experiencing with their current implementation, such as errors or performance issues. Additionally, the query could benefit from more specific information about the data being sent and the local storage requirements.", "task_category": "Coding & Debugging", "input_length": 344} 2 | {"uuid": "d668c3c7-b846-5a56-a45a-057df0cdf61d", "instruction": "I need assistance in scripting MSI ACLs (Access Control Lists) for a specific Windows Server application using PowerShell.", "gen_input_configs": {"temperature": 1.2, "top_p": 0.99}, "intent": "The user wants to script MSI ACLs (Access Control Lists) for a specific Windows Server application using PowerShell.", "knowledge": "To solve this problem, the models need to know PowerShell scripting, Windows Server ACLs, and MSI (Microsoft Installer) configuration.", "difficulty": "medium", "input_quality": "good", "quality_explanation": "The query is clear and specific about the topic (MSI ACLs for a Windows Server application using PowerShell), but it lacks context about the specific requirements or issues the user is facing. The query is well-structured and easy to understand, but it does not provide enough information for a comprehensive response.", "task_category": "Coding & Debugging", "input_length": 122} 3 | {"uuid": "df1d3e99-99a6-5292-955e-4392d6e6d81e", "instruction": "I am a developer and I have a custom app that is doing some complex data processing. You are a simulator for this application. I want to test how my application behaves in different scenarios. I can pass parameters to the app through a command line argument. I can also print output to the console or write to a file.", "gen_input_configs": {"temperature": 1.0, "top_p": 0.995}, "intent": "The user wants to test their custom application in different scenarios and simulate its behavior by passing parameters through command line arguments and printing output to the console or writing to a file.", "knowledge": "To solve this problem, the models need to know about command line arguments, console output, and file writing, as well as the syntax and functionality of the custom application.", "difficulty": "medium", "input_quality": "good", "quality_explanation": "The user query is clear and specific about the context of the application and the desired functionality. The user provides relevant details about the app, such as the ability to pass parameters through command-line arguments and print output to the console or file. However, the query could be improved by providing more specific information about the scenarios they want to test and the expected output.", "task_category": "Coding & Debugging", "input_length": 317} 4 | {"uuid": "8c617c92-de2e-536a-a347-9b92b353c760", "instruction": "I want to build a dashboard that shows a graph of the current price of a stock. I want to real-time data to be updated automatically. I will be using Python as the backend and dash library as the frontend to create the dashboard. 
To get the real-time data, I will be using the Alpha Vantage API.", "gen_input_configs": {"temperature": 1.0, "top_p": 0.99}, "intent": "The user wants to build a real-time dashboard to display the current price of a stock using Python and the Dash library, with real-time data updated automatically using the Alpha Vantage API.", "knowledge": "To solve this problem, the models need to know Python programming, Dash library, Alpha Vantage API, and basic data visualization concepts.", "difficulty": "medium", "input_quality": "good", "quality_explanation": "The user query is clear and specific about the requirements for building a dashboard, including the use of Python, Dash library, and Alpha Vantage API. The query also mentions the need for real-time data updates. However, it could be improved by providing more context about the type of graph and the specific stock(s) to be tracked. Additionally, the query assumes that the user has prior knowledge of the Dash library and Alpha Vantage API, which may not be the case for all users.", "task_category": "Coding & Debugging", "input_length": 295} 5 | {"uuid": "9dd10bcf-5fa8-53a6-b520-a96234989e23", "instruction": "I'm running an affiliate marketing campaign and I am exporting an affiliate network's API data to a custom dashboard using SQL and Python.", "gen_input_configs": {"temperature": 1.1, "top_p": 1.0}, "intent": "The user wants to analyze and visualize affiliate marketing campaign data using SQL and Python", "knowledge": "To solve this problem, the models need to know SQL, Python, and affiliate marketing concepts", "difficulty": "medium", "input_quality": "good", "quality_explanation": "The query is clear and specific about the context of the affiliate marketing campaign and the technical details of exporting data from an affiliate network's API to a custom dashboard using SQL and Python. 
The query lacks some context about the specific issues or problems the user is facing, but it provides a good foundation for understanding the user's intent.", "task_category": "Coding & Debugging", "input_length": 138} -------------------------------------------------------------------------------- /exp/gen_dis.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from sentence_transformers import SentenceTransformer 3 | import torch 4 | import numpy as np 5 | import faiss 6 | import argparse 7 | import json 8 | from tqdm import tqdm 9 | from utils import load_dataset_from_file 10 | 11 | ################ 12 | # Configurations 13 | ################ 14 | def get_args(): 15 | # Experiment Settings 16 | parser = argparse.ArgumentParser(description="Similarity Calculation Manager.") 17 | parser.add_argument("--sentence_model", type=str, default="sentence-transformers/all-mpnet-base-v2") 18 | parser.add_argument("--input_file", type=str, default=None, help="Input dataset file name") 19 | parser.add_argument("--encoding_batch_size", type=int, default=65536, help="Batch size for encoding the sentences.") 20 | parser.add_argument("--distance_distance_threshold", type=float, default=0.05, help="distance_threshold for the similarity search.") 21 | parser.add_argument("--search_space_size", type=int, default=500, help="Number of examples to search for similarity.") 22 | parser.add_argument("--search_batch_size", type=int, default=1024, help="Batch size for searching for similarity.") 23 | 24 | # System Settings 25 | parser.add_argument("--device", type=int, default=0) 26 | parser.add_argument("--save_faiss_index", type=bool, default=True, help="Save the Faiss index.") 27 | 28 | return parser.parse_args() 29 | 30 | args = get_args() 31 | 32 | sentence_model = args.sentence_model 33 | dataset_path = args.input_file 34 | dataset_name = dataset_path[dataset_path.rfind('/')+1:dataset_path.rfind('.')] 35 | output_file = f"../data/{dataset_name}_distance.jsonl" 36 | 37 | ################ 38 | # Step 1 - Load the Dataset and Build the Faiss Index 39 | ################ 40 | # Load the dataset 41 | dataset = load_dataset("json", data_files=dataset_path) 42 | print(dataset) 43 | inputs = dataset["train"]["instruction"] 44 | print(f"The second instruction in the dataset is: {inputs[1]}") 45 | 46 | model = SentenceTransformer(sentence_model) 47 | model.to(device=f'cuda:{args.device}', dtype=torch.float32) 48 | print(f"The model is loaded on device: {model.device}") 49 | 50 | # Encode the sentences in the dataset into vectors 51 | encoding_batch_size = args.encoding_batch_size # Adjust the batch size based on available memory 52 | embeddings = [] 53 | for i in range(0, len(inputs), encoding_batch_size): 54 | batch_sentences = inputs[i:i+encoding_batch_size] 55 | batch_embeddings = model.encode(batch_sentences, convert_to_tensor=True, show_progress_bar=True) 56 | embeddings.append(batch_embeddings.cpu().numpy()) 57 | 58 | # Concatenate the embeddings 59 | embeddings = np.concatenate(embeddings, axis=0) 60 | 61 | # Print out the shape of the concatenated embeddings to verify the results 62 | print(f"The shape of the concatenated embeddings is: {embeddings.shape}") 63 | 64 | # Add the encoded vectors to the dataset 65 | print("Adding the embeddings to the dataset...") 66 | dataset["train"] = dataset["train"].add_column("embeddings", embeddings.tolist()) 67 | 68 | # Build the Faiss index on the dataset 69 | print("Building the Faiss index...") 70 
| dataset["train"].add_faiss_index(column="embeddings") 71 | 72 | # Save the Faiss index 73 | if args.save_faiss_index: 74 | print("Saving the Faiss index...") 75 | index = dataset["train"].get_index("embeddings") 76 | faiss_index = index.faiss_index 77 | index_file = f"../data/{dataset_name}.faiss" 78 | faiss.write_index(faiss_index, index_file) 79 | 80 | ################ 81 | # Step 2 - Find Similar Examples 82 | ################ 83 | distance_threshold = args.distance_distance_threshold 84 | search_space_size = args.search_space_size 85 | search_batch_size = args.search_batch_size 86 | n_batches = (len(dataset["train"]) + search_batch_size - 1) // search_batch_size 87 | print(f"Number of batches: {n_batches}") 88 | 89 | # load the dataset in jsonl format 90 | unfilled_dataset = load_dataset_from_file(dataset_path) 91 | 92 | with open(output_file, 'a') as file: 93 | for batch_idx in tqdm(range(n_batches)): 94 | start_idx = batch_idx * search_batch_size 95 | end_idx = min((batch_idx + 1) * search_batch_size, len(dataset["train"])) 96 | 97 | batch_indices = list(range(start_idx, end_idx)) 98 | 99 | # Obtain the embeddings for the current batch 100 | batch_embeddings = embeddings[batch_indices] 101 | 102 | # Search for the most similar examples 103 | search_results = dataset["train"].search_batch(queries=batch_embeddings, k=search_space_size, index_name="embeddings") 104 | total_scores = search_results.total_scores 105 | total_indices = search_results.total_indices 106 | 107 | for i in range(len(total_indices)): 108 | scores = total_scores[i] 109 | indices = total_indices[i] 110 | min_distance = float(scores[1]) # should exclude itself 111 | dataset["train"][start_idx + i]["min_distance"] = min_distance 112 | 113 | # Filter indices based on the distance threshold 114 | filtered_indices = [index for index, score in zip(indices, scores) if score < distance_threshold] 115 | # Remove itself from the filtered indices 116 | filtered_indices = [index for index in filtered_indices if index != start_idx + i] 117 | 118 | if len(filtered_indices) == 0: 119 | repeat_count = 0 120 | min_similar_conversation_id = None 121 | 122 | dataset["train"][start_idx + i]["repeat_count"] = repeat_count 123 | dataset["train"][start_idx + i]["min_similar_conversation_id"] = min_similar_conversation_id 124 | else: 125 | min_similar_conversation_idx = int(min(filtered_indices)) 126 | if min_similar_conversation_idx >= start_idx + i: 127 | min_similar_conversation_id = dataset["train"][start_idx + i]["conversation_id"] 128 | else: 129 | min_similar_conversation_id = dataset["train"][min_similar_conversation_idx]["conversation_id"] 130 | 131 | repeat_count = len(filtered_indices) 132 | 133 | dataset["train"][start_idx + i]["repeat_count"] = repeat_count 134 | dataset["train"][start_idx + i]["min_similar_conversation_id"] = min_similar_conversation_id 135 | 136 | # save the updated dataset 137 | line = unfilled_dataset[start_idx + i] 138 | line["min_neighbor_distance"] = min_distance 139 | line["repeat_count"] = repeat_count 140 | line["min_similar_conversation_id"] = min_similar_conversation_id 141 | file.write(json.dumps(line) + '\n') 142 | 143 | print(f"Batch {batch_idx} is saved to the output file") 144 | 145 | print("Distance calculation is completed.") 146 | 147 | -------------------------------------------------------------------------------- /scripts/unitag.sh: -------------------------------------------------------------------------------- 1 | input_file=${1:-"none"} 2 | tag_mission=${2:-"all"} 3 | 
device=${3:-"0"} 4 | model_path=${4:-"meta-llama/Meta-Llama-3-8B-Instruct"} 5 | guard_model_path="meta-llama/Meta-Llama-Guard-2-8B" 6 | reward_model_path="sfairXC/FsfairX-LLaMA3-RM-v0.1" 7 | tensor_parallel=1 8 | gpu_memory_utilization=0.95 9 | batch_size=1000 10 | 11 | if [ $input_file == "none" ]; then 12 | echo "[magpie.sh] Input file not provided!" 13 | exit 1 14 | fi 15 | if [ ! -f $input_file ]; then 16 | echo "[magpie.sh] Input file not found!" 17 | exit 1 18 | fi 19 | 20 | # get job path from input file 21 | job_path=$(dirname "$input_file") 22 | exec > >(tee -a "$job_path/tagging.log") 2>&1 23 | echo "[magpie.sh] Job Path: $job_path" 24 | echo "[magpie.sh] Input File: $input_file" 25 | echo "[magpie.sh] Tagging Mission: $tag_mission" 26 | echo "[magpie.sh] Model Name: $model_path" 27 | echo "[magpie.sh] System Config: device=$device, batch_size=$batch_size, tensor_parallel=$tensor_parallel" 28 | 29 | if [ $tag_mission == "difficulty" ] || [ $tag_mission == "all" ]; then 30 | echo "[magpie.sh] Start Generating Difficulty Tags..." 31 | CUDA_VISIBLE_DEVICES=$device python ../exp/unitag.py \ 32 | --device $device \ 33 | --model_path $model_path \ 34 | --input_file $input_file \ 35 | --tag_mission "difficulty" \ 36 | --tensor_parallel $tensor_parallel \ 37 | --gpu_memory_utilization $gpu_memory_utilization \ 38 | --batch_size $batch_size \ 39 | 40 | echo "[magpie.sh] Finish Generating Difficulty Tags!" 41 | 42 | # Change input file name to difficulty tagged file 43 | input_file_name=$(basename $input_file) 44 | input_file_dir=$(dirname $input_file) 45 | input_file_name_no_ext="${input_file_name%.*}" 46 | input_file_ext="${input_file_name##*.}" 47 | difficulty_tag_file="${input_file_dir}/${input_file_name_no_ext}_difficulty.${input_file_ext}" 48 | input_file=$difficulty_tag_file 49 | echo "[magpie.sh] Difficulty Tagged File: $input_file" 50 | fi 51 | 52 | if [ $tag_mission == "quality" ] || [ $tag_mission == "all" ]; then 53 | echo "[magpie.sh] Start Generating Quality Tags..." 54 | CUDA_VISIBLE_DEVICES=$device python ../exp/unitag.py \ 55 | --device $device \ 56 | --model_path $model_path \ 57 | --input_file $input_file \ 58 | --tag_mission "quality" \ 59 | --tensor_parallel $tensor_parallel \ 60 | --gpu_memory_utilization $gpu_memory_utilization \ 61 | --batch_size $batch_size \ 62 | 63 | echo "[magpie.sh] Finish Generating Quality Tags!" 64 | 65 | # Change input file name to quality tagged file 66 | input_file_name=$(basename $input_file) 67 | input_file_dir=$(dirname $input_file) 68 | input_file_name_no_ext="${input_file_name%.*}" 69 | input_file_ext="${input_file_name##*.}" 70 | quality_tag_file="${input_file_dir}/${input_file_name_no_ext}_quality.${input_file_ext}" 71 | input_file=$quality_tag_file 72 | echo "[magpie.sh] Quality Tagged File: $input_file" 73 | fi 74 | 75 | if [ $tag_mission == "classification" ] || [ $tag_mission == "all" ]; then 76 | echo "[magpie.sh] Start Generating Task Tags..." 77 | CUDA_VISIBLE_DEVICES=$device python ../exp/unitag.py \ 78 | --device $device \ 79 | --model_path $model_path \ 80 | --input_file $input_file \ 81 | --tag_mission "classification" \ 82 | --tensor_parallel $tensor_parallel \ 83 | --gpu_memory_utilization $gpu_memory_utilization \ 84 | --batch_size $batch_size \ 85 | 86 | echo "[magpie.sh] Finish Generating Task Tags!" 
87 | 88 | # Change input file name to task tagged file 89 | input_file_name=$(basename $input_file) 90 | input_file_dir=$(dirname $input_file) 91 | input_file_name_no_ext="${input_file_name%.*}" 92 | input_file_ext="${input_file_name##*.}" 93 | task_tag_file="${input_file_dir}/${input_file_name_no_ext}_category.${input_file_ext}" 94 | input_file=$task_tag_file 95 | echo "[magpie.sh] Task Tagged File: $input_file" 96 | fi 97 | 98 | if [ $tag_mission == "safety" ] || [ $tag_mission == "all" ]; then 99 | echo "[magpie.sh] Start Generating Safety Tags..." 100 | CUDA_VISIBLE_DEVICES=$device python ../exp/unitag.py \ 101 | --device $device \ 102 | --guard_model_path $guard_model_path \ 103 | --input_file $input_file \ 104 | --tag_mission "safety" \ 105 | --tensor_parallel $tensor_parallel \ 106 | --gpu_memory_utilization $gpu_memory_utilization \ 107 | --batch_size $batch_size \ 108 | 109 | echo "[magpie.sh] Finish Generating Safety Tags!" 110 | 111 | # Change input file name to quality tagged file 112 | input_file_name=$(basename $input_file) 113 | input_file_dir=$(dirname $input_file) 114 | input_file_name_no_ext="${input_file_name%.*}" 115 | input_file_ext="${input_file_name##*.}" 116 | safety_tag_file="${input_file_dir}/${input_file_name_no_ext}_safety.${input_file_ext}" 117 | input_file=$safety_tag_file 118 | echo "[magpie.sh] Safety Tagged File: $input_file" 119 | fi 120 | 121 | if [ $tag_mission == "reward" ] || [ $tag_mission == "all" ]; then 122 | echo "[magpie.sh] Start Generating Reward Tags..." 123 | python ../exp/unitag.py \ 124 | --device $device \ 125 | --reward_model_path $reward_model_path \ 126 | --input_file $input_file \ 127 | --tag_mission "reward" \ 128 | --tensor_parallel $tensor_parallel \ 129 | --batch_size 1 \ 130 | 131 | echo "[magpie.sh] Finish Generating Reward Tags!" 132 | 133 | # Change input file name to quality tagged file 134 | input_file_name=$(basename $input_file) 135 | input_file_dir=$(dirname $input_file) 136 | input_file_name_no_ext="${input_file_name%.*}" 137 | input_file_ext="${input_file_name##*.}" 138 | reward_tag_file="${input_file_dir}/${input_file_name_no_ext}_reward.${input_file_ext}" 139 | input_file=$reward_tag_file 140 | echo "[magpie.sh] Reward Tagged File: $input_file" 141 | fi 142 | 143 | if [ $tag_mission == "language" ] || [ $tag_mission == "all" ]; then 144 | echo "[magpie.sh] Start Generating Language Tags..." 145 | CUDA_VISIBLE_DEVICES=$device python ../exp/unitag.py \ 146 | --device $device \ 147 | --input_file $input_file \ 148 | --tag_mission "language" \ 149 | 150 | echo "[magpie.sh] Finish Generating Language Tags!" 
151 | 152 | # Change input file name to quality tagged file 153 | input_file_name=$(basename $input_file) 154 | input_file_dir=$(dirname $input_file) 155 | input_file_name_no_ext="${input_file_name%.*}" 156 | input_file_ext="${input_file_name##*.}" 157 | language_tag_file="${input_file_dir}/${input_file_name_no_ext}_language.${input_file_ext}" 158 | input_file=$language_tag_file 159 | echo "[magpie.sh] Language Tagged File: $input_file" 160 | fi 161 | 162 | echo "[magpie.sh] Finish Tagging Mission: $tag_mission" -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Magpie Toy Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 20 | ] 21 | }, 22 | { 23 | "data": { 24 | "application/vnd.jupyter.widget-view+json": { 25 | "model_id": "848f65bd3c1243fea1e91865f48489dd", 26 | "version_major": 2, 27 | "version_minor": 0 28 | }, 29 | "text/plain": [ 30 | "Loading checkpoint shards: 0%| | 0/4 [00:00<|start_header_id|>user<|end_header_id|>\n", 79 | "\n", 80 | "\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "with open(\"configs/model_configs.json\", \"r\") as f:\n", 86 | " model_configs = json.load(f)\n", 87 | " model_config = model_configs[model_id]\n", 88 | "\n", 89 | "# Prompt for extracting instructions from Llama-3-8B-Instruct\n", 90 | "pre_query_template = model_config[\"pre_query_template\"]\n", 91 | "print(pre_query_template)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "#### Step 1: Extracting Instructions" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stderr", 108 | "output_type": "stream", 109 | "text": [ 110 | "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" 111 | ] 112 | }, 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "Extracted Instruction: I'd like to get a new pair of shoes for the summer. 
What kind of shoes are good for warm weather?\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "terminators = [\n", 123 | " tokenizer.eos_token_id,\n", 124 | " tokenizer.convert_tokens_to_ids(\"<|eot_id|>\"),\n", 125 | "]\n", 126 | "\n", 127 | "instruction = pipeline(\n", 128 | " pre_query_template,\n", 129 | " max_new_tokens=2048,\n", 130 | " eos_token_id=terminators,\n", 131 | " do_sample=True,\n", 132 | " temperature=1,\n", 133 | " top_p=1,\n", 134 | ")\n", 135 | "\n", 136 | "sanitized_instruction = instruction[0]['generated_text'][len(pre_query_template):].split(\"\\n\")[0]\n", 137 | "print(f\"Extracted Instruction: {sanitized_instruction}\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Step 2: Generating Responses" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stderr", 154 | "output_type": "stream", 155 | "text": [ 156 | "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" 157 | ] 158 | }, 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "Conversation(name='llama-3', system_template='<|start_header_id|>system<|end_header_id|>\\n\\n{system_message}<|eot_id|>', system_message='', roles=('user', 'assistant'), messages=[], offset=0, sep_style=, sep='', sep2=None, stop_str='<|eot_id|>', stop_token_ids=[128001, 128009], max_image_size_mb=None)\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "conv = get_conversation_template(\"llama-3\")\n", 169 | "print(conv) # Make sure the conversation template is correct with name='llama-3' !!!\n", 170 | "conv.append_message(conv.roles[0], sanitized_instruction)\n", 171 | "conv.append_message(conv.roles[1], None)\n", 172 | "prompt = conv.get_prompt()\n", 173 | "\n", 174 | "response = pipeline(\n", 175 | " prompt,\n", 176 | " max_new_tokens=2048,\n", 177 | " eos_token_id=terminators,\n", 178 | " do_sample=True,\n", 179 | " temperature=1,\n", 180 | " top_p=1,\n", 181 | ")" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### We have successfully created an instruction-response pair!" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 5, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "User's message: I'd like to get a new pair of shoes for the summer. What kind of shoes are good for warm weather?\n", 201 | "Assitant's response: Perfect timing! When it comes to shoes for warm weather, you'll want something comfortable, breathable, and stylish. Here are some great options:\n", 202 | "\n", 203 | "1. **Sandals**: A classic choice for summer, sandals are perfect for walking, beach trips, or lounging around. Look for ones with good arch support and adjustable straps.\n", 204 | "2. **Sneakers with breathable materials**: Sneakers made from mesh, canvas, or mesh panels are great for hot weather. Brands like New Balance, ASICS, or Converse offer such options.\n", 205 | "3. **Espadrilles**: These flat, rope-soled shoes are perfect for casual outings. They're lightweight, breathable, and can be dressed up or down.\n", 206 | "4. **Slide sandals**: Similar to traditional sandals, but with a more relaxed design. They're great for poolside or beach activities.\n", 207 | "5. 
**Loafers**: A stylish option for warm weather, loafers are lightweight, comfortable, and can be dressed up or down.\n", 208 | "6. **Water shoes**: If you plan to spend time in or around water (e.g., swimming, kayaking, or paddleboarding), waterproof or water-friendly shoes are a must-have.\n", 209 | "7. **Mules**: These backless shoes are perfect for warm weather. Look for ones with a soft, breathable upper and a cushioned insole.\n", 210 | "\n", 211 | "When shopping for shoes for warm weather, consider the following features:\n", 212 | "\n", 213 | "* Breathable materials (e.g., mesh, canvas, leather with mesh panels)\n", 214 | "* Lightweight construction\n", 215 | "* Adjustable straps or closures for a secure fit\n", 216 | "* Cushioned insoles for added comfort\n", 217 | "* Water-friendly or quick-drying materials (if you plan to get wet)\n", 218 | "* A simple, casual design for easy pairing with lightweight outfits\n", 219 | "\n", 220 | "Which type of shoes do you think you might be interested in?\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "print(f\"User's message: {sanitized_instruction}\")\n", 226 | "print(f\"Assitant's response: {response[0]['generated_text'][len(prompt):]}\")" 227 | ] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "datareverse", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.10.14" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /navigation.md: -------------------------------------------------------------------------------- 1 | # Dataset Navigation 🧭 2 | 3 | ## Alignment Datasets 4 | 5 | ### [**Meta Llama 3.3**](https://huggingface.co/collections/meta-llama/llama-33-67531d5c405ec5d08a852000) 6 | |Model Name | Dataset | Type | Description | 7 | |-------------|:-------|:-------|:-------| 8 | | [Llama 3.3 70B Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | [Magpie-Llama-3.3-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.3-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Meta Llama 3.3 70B. 9 | | [Llama 3.3 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.3-70B-Instruct) | [Magpie-Llama-3.3-Pro-500K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.3-Pro-500K-Filtered) | SFT | Apply a filter and select 500K high quality conversations. 10 | 11 | ### [**Meta Llama 3.1**](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) 12 | |Model Name | Dataset | Type | Description | 13 | |-------------|:-------|:-------|:-------| 14 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Meta Llama 3.1 70B. 15 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 
16 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-500K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-500K-Filtered) | SFT | Apply a filter and select 500K high quality conversations. 17 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-MT-500K](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-MT-500K-v0.1) | SFT | Extend Magpie-Llama-3.1-Pro-500K-Filtered to multi-turn. 18 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-MT-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-MT-300K-Filtered) | SFT | Select 300K high quality multi-turn conversations from Magpie-Llama-3.1-Pro-MT-500K. 19 | | [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | [Magpie-Llama-3.1-Pro-DPO-100K](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-DPO-100K-v0.1) | DPO | DPO dataset via Best-of-N sampling and rewards. 20 | 21 | ### [**Meta Llama 3**](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6) 22 | |Model Name | Dataset | Type | Description | 23 | |-------------|:-------|:-------|:-------| 24 | | [Llama 3 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | [Magpie-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Meta Llama 3 70B. 25 | | [Llama 3 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | [Magpie-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 26 | | [Llama 3 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | [Magpie-Pro-MT-300K](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-MT-300K-v0.1) | SFT | Select 300K difficult questions and extend to multi-turn conversations. 27 | | [Llama 3 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | [Magpie-Pro-DPO-100K](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-DPO-100K-v0.1) | DPO | DPO dataset via Best-of-N sampling and rewards. 28 | | [Llama 3 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [Magpie-Air-3M](https://huggingface.co/datasets/Magpie-Align/Llama-3-Magpie-Air-3M-v0.1) | SFT | 3M Raw conversations built with Meta Llama 3 8B. 29 | | [Llama 3 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [Magpie-Air-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Air-300K-Filtered) | SFT | Apply a filter and select 300K high quality data. 30 | | [Llama 3 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [Magpie-Air-MT-300K](https://huggingface.co/datasets/Magpie-Align/Magpie-Air-MT-300K-v0.1) | SFT | Select 300K difficult questions and extend to multi-turn conversations. 31 | | [Llama 3 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [Magpie-Air-DPO-100K](https://huggingface.co/datasets/Magpie-Align/Magpie-Air-DPO-100K-v0.1) | DPO | DPO dataset via Best-of-N sampling and rewards. 
32 | 33 | ### [**Qwen2.5**](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e) 34 | |Model Name | Dataset | Type | Description | 35 | |-------------|:-------|:-------|:-------| 36 | | [Qwen2.5 72B Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | [Magpie-Qwen2.5-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Qwen2.5 72B Instruct. 37 | | [Qwen2.5 72B Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | [Magpie-Qwen2.5-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Pro-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 38 | 39 | ### [**Qwen2**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) 40 | |Model Name | Dataset | Type | Description | 41 | |-------------|:-------|:-------|:-------| 42 | | [Qwen2 72B Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | [Magpie-Qwen2-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Qwen2 72B Instruct. 43 | | [Qwen2 72B Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | [Magpie-Qwen2-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 44 | | [Qwen2 72B Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | [Magpie-Qwen2-Pro-200K-Chinese](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese) | SFT | Apply a filter and select 200K high quality Chinese conversations. 45 | | [Qwen2 72B Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | [Magpie-Qwen2-Pro-200K-English](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-English) | SFT | Apply a filter and select 200K high quality English conversations. 46 | | [Qwen2 7B Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | [Magpie-Qwen2-Air-3M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Air-3M-v0.1) | SFT | 3M Raw conversations built with Qwen2 7B Instruct. 47 | | [Qwen2 7B Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | [Magpie-Qwen2-Air-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen-Air-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 48 | 49 | ### [**Phi-3**](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) 50 | |Model Name | Dataset | Type | Description | 51 | |-------------|:-------|:-------|:-------| 52 | | [Phi-3 Medium Instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | [Magpie-Phi3-Pro-1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Phi3-Pro-1M-v0.1) | SFT | 1M Raw conversations built with Phi-3 Medium Instruct. 53 | | [Phi-3 Medium Instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | [Magpie-Phi3-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Phi3-Pro-300K-Filtered) | SFT | Apply a filter and select 300K high quality conversations. 54 | 55 | ### [**Gemma-2**](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) 56 | |Model Name | Dataset | Type | Description | 57 | |-------------|:-------|:-------|:-------| 58 | | [Gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it) | [Magpie-Gemma2-Pro-534K](https://huggingface.co/datasets/Magpie-Align/Magpie-Gemma2-Pro-534K-v0.1) | SFT | 534K conversations built with Gemma-2-27b-it. 
59 | | [Gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it) | [Magpie-Gemma2-Pro-200K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Gemma2-Pro-200K-Filtered) | SFT | Apply a filter and select 200K conversations. 60 | 61 | --- 62 | 63 | ## Domain Datasets 64 | 65 | ### CoT Reasoning 66 | |Model | Dataset | Type | Description | 67 | |-------------|:-------|:-------|:-------| 68 | | Qwen2-72B-Instruct (instruction) + Llama-3-70B-Instruct (response) | [Magpie-Reasoning-V1-150K](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V1-150K) | SFT | 150K conversations built with Qwen2-72B-Instruct + Llama-3-70B-Instruct. 69 | | Qwen2-72B-Instruct (instruction) + QwQ-Preview (response) | [Magpie-Reasoning-V1-150K-CoT-QwQ](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V1-150K-CoT-QwQ) | SFT | 150K conversations built with Qwen2-72B-Instruct + QwQ-Preview. 70 | | Qwen2-72B-Instruct (instruction) + Skywork-O1-Llama-3.1-8B (response) | [Magpie-Reasoning-V1-150K-CoT-Skywork-O1-Llama-3.1-8B](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V1-150K-CoT-Skywork-O1-Llama-3.1-8B) | SFT | 150K conversations built with Qwen2-72B-Instruct + Skywork-O1-Llama-3.1-8B. 71 | | Qwen2-72B-Instruct (instruction) + QwQ-Preview (response) | [Magpie-Reasoning-V1-150K-CoT-Deepseek-R1-Llama-70B](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V1-150K-CoT-Deepseek-R1-Llama-70B) | SFT | 150K conversations built with Qwen2-72B-Instruct + Deepseek-R1-Llama-70B. 72 | | Llama3.1/3.3-70B-Instruct | [Magpie-Reasoning-V2-250K-CoT-Llama3](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Llama3) | SFT | 250K conversations built with Llama3.1-70B-Instruct + Llama3.3-70B-Instruct. 73 | | Llama3.1/3.3-70B-Instruct (instruction) + QwQ-Preview (response) | [Magpie-Reasoning-V2-250K-CoT-QwQ](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V2-250K-CoT-QwQ) | SFT | 250K conversations built with Llama3.1/3.3-70B-Instruct + QwQ-Preview. 74 | | Llama3.1/3.3-70B-Instruct (instruction) + Skywork-O1-Llama-3.1-8B (response) | [Magpie-Reasoning-V2-250K-CoT-Skywork-O1-Llama-3.1-8B](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Skywork-O1-Llama-3.1-8B) | SFT | 250K conversations built with Llama3.1/3.3-70B-Instruct + Skywork-O1-Llama-3.1-8B. 75 | | Llama3.1/3.3-70B-Instruct (instruction) + Deepseek-R1-Llama-70B (response) | [Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B) | SFT | 250K conversations built with Llama3.1/3.3-70B-Instruct + Deepseek-R1-Llama-70B. 76 | 77 | 78 | ### Coding & Debugging 79 | 80 | |Model | Dataset | Type | Description | 81 | |-------------|:-------|:-------|:-------| 82 | | [Qwen2.5 Coder 32B Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) | [Magpie-Qwen2.5-Coder-Pro-300K-v0.1](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Coder-Pro-300K-v0.1) | SFT | 300K Raw conversations built with Qwen2.5 Coder 32B Instruct. 83 | 84 | ### Math 85 | 86 | |Model | Dataset | Type | Description | 87 | |-------------|:-------|:-------|:-------| 88 | | [Qwen2.5 Math 72B Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | [Magpie-Qwen2.5-Math-Pro-300K-v0.1](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Math-Pro-300K-v0.1) | SFT | 300K Raw conversations built with Qwen2.5 Math 72B Instruct. 
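All of the datasets above are hosted on the Hugging Face Hub under the `Magpie-Align` organization, so they can be pulled directly with the Hugging Face `datasets` library. The snippet below is a minimal sketch, assuming `datasets` is installed and the usual single `train` split; it uses one SFT dataset ID from the tables above, and any other listed ID can be substituted the same way:

```python
# Minimal sketch: load one Magpie dataset from the catalog above.
# Assumes the Hugging Face `datasets` library is installed.
from datasets import load_dataset

# Substitute any dataset ID from the tables above.
ds = load_dataset("Magpie-Align/Magpie-Pro-300K-Filtered", split="train")

print(ds)            # number of rows and column names
print(ds[0].keys())  # fields of the first example (schemas vary per dataset)
```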
89 | -------------------------------------------------------------------------------- /exp/gen_mt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import argparse 5 | import json 6 | import requests 7 | import concurrent.futures 8 | from time import sleep 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer 11 | from utils import load_dataset_from_file, save_dataset, get_conversation_template 12 | from vllm import LLM, SamplingParams 13 | 14 | ################ 15 | # Configurations 16 | ################ 17 | def get_args(): 18 | # Experiment Settings 19 | parser = argparse.ArgumentParser(description="Response Generation Manager.") 20 | parser.add_argument("--model_path", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct", 21 | help="We will support more models in the future.") 22 | parser.add_argument("--input_file", type=str, default=None, help="Input dataset file name") 23 | parser.add_argument("--num_turns", type=int, default=2, help="Number of turns for each conversation.") 24 | parser.add_argument("--batch_size", type=int, default=128, help="Number of samples per batch") 25 | parser.add_argument("--checkpoint_every", type=int, default=20, help="Save checkpoint every n batches") 26 | 27 | # Generation Parameters 28 | parser.add_argument("--device", type=str, default="0") 29 | parser.add_argument("--dtype", type=str, default="bfloat16", choices=["float16", "bfloat16"]) 30 | parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Number of GPUs to use for tensor parallelism. Only used for Llama 70B models.") 31 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.95) 32 | parser.add_argument("--max_tokens", type=int, default=4096) 33 | parser.add_argument("--max_model_len", type=int, default=4096) 34 | parser.add_argument("--early_stopping", type=bool, default=True, help="Stop generation when the \n is generated.") 35 | parser.add_argument("--disable_early_stopping", action="store_false", dest="early_stopping", help="Disable early stopping.") 36 | parser.add_argument("--instruction_temperature", type=float, default=1.0) 37 | parser.add_argument("--instruction_top_p", type=float, default=1.0) 38 | parser.add_argument("--instruction_repetition_penalty", type=float, default=1.0) 39 | parser.add_argument("--response_temperature", type=float, default=0) 40 | parser.add_argument("--response_top_p", type=float, default=1.0) 41 | parser.add_argument("--response_repetition_penalty", type=float, default=1.0) 42 | parser.add_argument("--tokenizer_template", type=bool, default=False, help="Use tokenizer template for generating the response.") 43 | parser.add_argument("--use_tokenizer_template", action="store_true", dest="tokenizer_template") 44 | parser.add_argument("--debug", type=bool, default=False, help="Debug mode.") 45 | 46 | return parser.parse_args() 47 | 48 | args = get_args() 49 | print(f"Response Generation Manager. Arguments: {args}") # For logging 50 | 51 | mt_system_prompt = "You are a helpful AI assistant. The user will engage in a multi-round conversation with you, asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and insightful responses to help the user with their queries."
52 | 53 | if args.input_file is None: 54 | raise ValueError("Please specify the input file path.") 55 | if args.num_turns < 2: 56 | raise ValueError("Please specify a number of turns greater than 1.") 57 | 58 | # Constants for the local vllm engine 59 | MODEL_NAME = args.model_path 60 | INPUT_FILE_NAME = args.input_file 61 | BATCH_SIZE = args.batch_size 62 | CHECKPOINT_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_mt_checkpoint.json" 63 | CHECKPOINT_EVERY = args.checkpoint_every 64 | SAVED_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_mt.json" 65 | 66 | # Obtain config from configs/model_configs.json 67 | with open("../configs/model_configs.json", "r") as f: 68 | model_configs = json.load(f) 69 | model_config = model_configs[args.model_path] 70 | stop_tokens = model_config["stop_tokens"] 71 | stop_token_ids = model_config["stop_token_ids"] 72 | mt_append_template = model_config["mt_append_template"] 73 | 74 | if args.early_stopping: 75 | stop_tokens.append("\n") 76 | 77 | # Process a batch of data using local vllm engine 78 | def process_batch(batch, llm, instruction_params, response_params, tokenizer=None): 79 | # user_instructions = [item['instruction'] for item in batch] 80 | for turn in range(2, args.num_turns+1): 81 | print(f"Processing turn {turn}...") 82 | # Generate Instructions 83 | print(f"Generating instructions for turn {turn}...") 84 | prompts = [] 85 | for item in batch: 86 | if not args.tokenizer_template: 87 | conv = get_conversation_template(MODEL_NAME) 88 | conv.system_message = mt_system_prompt 89 | if turn == 2: 90 | conv.append_message(conv.roles[0], item[f'instruction']) 91 | conv.append_message(conv.roles[1], item[f'response']) 92 | else: 93 | conv.append_message(conv.roles[0], item[f'instruction']) 94 | conv.append_message(conv.roles[1], item[f'response']) 95 | for i in range(2, turn): 96 | conv.append_message(conv.roles[0], item[f'instruction_{i}']) 97 | conv.append_message(conv.roles[1], item[f'response_{i}']) 98 | template = conv.get_prompt() + mt_append_template 99 | else: 100 | chat = [] 101 | chat.append({"role": "system", "content": mt_system_prompt}) 102 | if turn == 2: 103 | chat.append({"role": "user", "content": item[f'instruction']}) 104 | chat.append({"role": "assistant", "content": item[f'response']}) 105 | else: 106 | chat.append({"role": "user", "content": item[f'instruction']}) 107 | chat.append({"role": "assistant", "content": item[f'response']}) 108 | for i in range(2, turn): 109 | chat.append({"role": "user", "content": item[f'instruction_{i}']}) 110 | chat.append({"role": "assistant", "content": item[f'response_{i}']}) 111 | template = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + mt_append_template 112 | prompts.append(template) 113 | outputs = llm.generate(prompts, instruction_params) 114 | for i, item in enumerate(batch): 115 | item[f'instruction_{turn}'] = outputs[i].outputs[0].text.strip() 116 | 117 | # Generate Responses 118 | print(f"Generating responses for turn {turn}...") 119 | prompts = [] 120 | for item in batch: 121 | if not args.tokenizer_template: 122 | conv = get_conversation_template(MODEL_NAME) 123 | if turn == 2: 124 | conv.append_message(conv.roles[0], item[f'instruction']) 125 | conv.append_message(conv.roles[1], item[f'response']) 126 | else: 127 | conv.append_message(conv.roles[0], item[f'instruction']) 128 | conv.append_message(conv.roles[1], item[f'response']) 129 | for i in range(2, turn): 130 | conv.append_message(conv.roles[0], item[f'instruction_{i}']) 131 | 
conv.append_message(conv.roles[1], item[f'response_{i}']) 132 | conv.append_message(conv.roles[0], item[f'instruction_{turn}']) 133 | conv.append_message(conv.roles[1], None) 134 | template = conv.get_prompt() 135 | else: 136 | chat = [] 137 | if turn == 2: 138 | chat.append({"role": "user", "content": item[f'instruction']}) 139 | chat.append({"role": "assistant", "content": item[f'response']}) 140 | else: 141 | chat.append({"role": "user", "content": item[f'instruction']}) 142 | chat.append({"role": "assistant", "content": item[f'response']}) 143 | for i in range(2, turn): 144 | chat.append({"role": "user", "content": item[f'instruction_{i}']}) 145 | chat.append({"role": "assistant", "content": item[f'response_{i}']}) 146 | chat.append({"role": "user", "content": item[f'instruction_{turn}']}) 147 | template = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) 148 | prompts.append(template) 149 | outputs = llm.generate(prompts, response_params) 150 | for i, item in enumerate(batch): 151 | item[f'response_{turn}'] = outputs[i].outputs[0].text.strip() 152 | 153 | return batch 154 | 155 | # Generate outputs, update dataset in batches, and overwrite checkpoint 156 | def generate_and_update(dataset, llm=None, instruction_params=None, response_params=None, tokenizer=None): 157 | 158 | # Intialize the dataset with the checkpoint file (if it exists) 159 | if os.path.exists(CHECKPOINT_FILE): 160 | last_checkpoint_idx = len(load_dataset_from_file(CHECKPOINT_FILE)) 161 | print(f"Checkpoint file found. Resuming from last checkpoint with index {last_checkpoint_idx}.") 162 | dataset[:last_checkpoint_idx] = load_dataset_from_file(CHECKPOINT_FILE) 163 | # Calculate total number of batches 164 | num_batches = (len(dataset) - last_checkpoint_idx + BATCH_SIZE - 1) // BATCH_SIZE 165 | 166 | print(f"Remaining number of batches: {num_batches}") 167 | else: 168 | last_checkpoint_idx = 0 169 | # Calculate total number of batches 170 | num_batches = (len(dataset) + BATCH_SIZE - 1) // BATCH_SIZE 171 | print(f"Total number of batches: {num_batches}") 172 | 173 | for i in tqdm(range(num_batches)): 174 | start_idx = i * BATCH_SIZE + last_checkpoint_idx 175 | end_idx = min((i + 1) * BATCH_SIZE + last_checkpoint_idx, len(dataset)) 176 | batch = dataset[start_idx:end_idx] 177 | batch = process_batch(batch, llm, instruction_params, response_params, tokenizer) 178 | 179 | dataset[start_idx:end_idx] = batch 180 | # Overwrite the same checkpoint file after serveral batches 181 | if i % CHECKPOINT_EVERY == 0: 182 | save_dataset(dataset[:end_idx], CHECKPOINT_FILE) 183 | print(f"Dataset checkpoint saved after batch {i + 1}.") 184 | 185 | return dataset 186 | 187 | # Main function to control workflow 188 | def main(): 189 | # Load instructions from the input file 190 | dataset = load_dataset_from_file(INPUT_FILE_NAME) 191 | if args.debug: 192 | dataset = dataset[:1] # For debugging 193 | 194 | # Set the device 195 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 196 | print("Start Local vllm engine...") 197 | llm = LLM(model=MODEL_NAME, 198 | dtype=args.dtype, 199 | trust_remote_code=True, 200 | max_model_len = args.max_model_len, # limited by kv-cache 201 | tensor_parallel_size = args.tensor_parallel_size, 202 | gpu_memory_utilization = args.gpu_memory_utilization) 203 | 204 | instruction_params = SamplingParams( 205 | max_tokens=args.max_tokens, 206 | temperature=args.instruction_temperature, 207 | top_p=args.instruction_top_p, 208 | repetition_penalty=args.instruction_repetition_penalty, 209 | 
stop_token_ids=stop_token_ids, 210 | stop=stop_tokens, 211 | ) 212 | 213 | response_params = SamplingParams( 214 | max_tokens=args.max_tokens, 215 | temperature=args.response_temperature, 216 | top_p=args.response_top_p, 217 | repetition_penalty=args.response_repetition_penalty, 218 | stop_token_ids=stop_token_ids, 219 | ) 220 | 221 | updated_dataset = generate_and_update(dataset, llm, instruction_params, response_params, tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)) 222 | 223 | # Save final dataset 224 | save_dataset(updated_dataset, SAVED_FILE) 225 | 226 | # Optionally remove the checkpoint file after completion 227 | os.remove(CHECKPOINT_FILE) 228 | print("Final dataset saved. Checkpoint removed.") 229 | 230 | # Run the main function 231 | if __name__ == "__main__": 232 | main() -------------------------------------------------------------------------------- /exp/gen_res.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import argparse 5 | import json 6 | import requests 7 | import concurrent.futures 8 | from time import sleep 9 | from tqdm import tqdm 10 | from utils import load_dataset_from_file, save_dataset, make_api_request_with_retry, get_conversation_template 11 | from vllm import LLM, SamplingParams 12 | from transformers import AutoTokenizer, AutoModelForCausalLM 13 | 14 | ################ 15 | # Configurations 16 | ################ 17 | def get_args(): 18 | # Experiment Settings 19 | parser = argparse.ArgumentParser(description="Response Generation Manager.") 20 | parser.add_argument("--model_path", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct", 21 | help="We will support more models in the future.") 22 | parser.add_argument("--input_file", type=str, default=None, help="Input dataset file name") 23 | parser.add_argument("--batch_size", type=int, default=128, help="Number of samples per batch") 24 | parser.add_argument("--checkpoint_every", type=int, default=20, help="Save checkpoint every n batches") 25 | parser.add_argument("--api_url", type=str, default="https://api.together.xyz/v1/chat/completions", help="API URL") 26 | parser.add_argument("--api_key", type=str, default=None, help="Together API Key") 27 | parser.add_argument("--offline", action="store_true", help="Use local engine") 28 | 29 | # Generation Parameters 30 | parser.add_argument('--engine', default="vllm", type=str, choices=["vllm", "hf", "together"]) 31 | parser.add_argument("--device", type=str, default="0") 32 | parser.add_argument("--dtype", type=str, default="bfloat16", choices=["float16", "bfloat16"]) 33 | parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Number of GPUs to use for tensor parallelism. Only used for Llama 70B models.") 34 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.95) 35 | parser.add_argument("--max_tokens", type=int, default=4096) 36 | parser.add_argument("--max_model_len", type=int, default=4096) 37 | parser.add_argument("--temperature", type=float, default=0) 38 | parser.add_argument("--top_p", type=float, default=1.0) 39 | parser.add_argument("--repetition_penalty", type=float, default=1.0) 40 | parser.add_argument("--tokenizer_template", type=bool, default=True, help="Use tokenizer template for generating the response.") 41 | parser.add_argument("--use_tokenizer_template", action="store_true", dest="tokenizer_template") 42 | 43 | return parser.parse_args() 44 | 45 | args = get_args() 46 | print(f"Response Generation Manager. 
Arguments: {args}") # For logging 47 | 48 | if args.input_file is None: 49 | raise ValueError("Please specify the input file path.") 50 | 51 | # Constants for the local vllm engine 52 | MODEL_NAME = args.model_path 53 | INPUT_FILE_NAME = args.input_file 54 | BATCH_SIZE = args.batch_size 55 | CHECKPOINT_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_res_checkpoint.json" 56 | CHECKPOINT_EVERY = args.checkpoint_every 57 | SAVED_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_res.json" 58 | 59 | # Obtain config from configs/model_configs.json 60 | with open("../configs/model_configs.json", "r") as f: 61 | model_configs = json.load(f) 62 | model_config = model_configs[args.model_path] 63 | stop_tokens = model_config["stop_tokens"] 64 | stop_token_ids = model_config["stop_token_ids"] 65 | 66 | # API Setups 67 | if args.engine == "together": 68 | # Change name for API (Together Naming Convention) 69 | if MODEL_NAME == "meta-llama/Meta-Llama-3-8B-Instruct": 70 | api_model_name = "meta-llama/Llama-3-8b-chat-hf" 71 | elif MODEL_NAME == "meta-llama/Meta-Llama-3-70B-Instruct": 72 | api_model_name = "meta-llama/Llama-3-70b-chat-hf" 73 | else: 74 | api_model_name = MODEL_NAME 75 | 76 | # Constants for the API 77 | API_ENDPOINT = args.api_url 78 | API_HEADERS = { 79 | "Authorization": args.api_key, 80 | } 81 | API_PARAMS = { 82 | "model": api_model_name, 83 | "max_tokens": args.max_tokens, 84 | "temperature": args.temperature, 85 | "top_p": args.top_p, 86 | "repetition_penalty": args.repetition_penalty, 87 | "stop": stop_tokens 88 | } 89 | 90 | # Process a batch of data using the API 91 | def process_batch_with_api(batch): 92 | # with concurrent.futures.ThreadPoolExecutor() as executor: 93 | with concurrent.futures.ProcessPoolExecutor() as executor: 94 | future_to_item = { 95 | executor.submit( 96 | make_api_request_with_retry, 97 | [{'content': item['instruction'], 'role': 'user'}], 98 | API_PARAMS, 99 | API_ENDPOINT, 100 | API_HEADERS, 101 | ): item 102 | for item in batch 103 | } 104 | 105 | for future in concurrent.futures.as_completed(future_to_item): 106 | item = future_to_item[future] 107 | try: 108 | api_response = future.result() 109 | item['response'] = api_response.strip() 110 | item['gen_response_configs'] = { 111 | "temperature": args.temperature, 112 | "top_p": args.top_p, 113 | "repetition_penalty": args.repetition_penalty, 114 | "max_tokens": args.max_tokens, 115 | "stop_tokens": stop_tokens, 116 | "output_generator": MODEL_NAME, 117 | "engine": api_model_name, 118 | } 119 | except Exception as e: 120 | print(f"Failed to process item: {item} with error: {str(e)}") 121 | item['response'] = "" 122 | 123 | return batch 124 | 125 | # Process a batch of data using local vllm engine 126 | def process_batch(batch, llm, params, tokenizer=None): 127 | user_instructions = [item['instruction'] for item in batch] 128 | prompts = [] 129 | for instruction in user_instructions: 130 | if not args.tokenizer_template: 131 | conv = get_conversation_template(MODEL_NAME) 132 | conv.append_message(conv.roles[0], instruction) 133 | conv.append_message(conv.roles[1], None) 134 | template = conv.get_prompt() 135 | else: 136 | chat = [{"role": "user", "content": instruction}] 137 | template = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) 138 | prompts.append(template) 139 | if args.engine == "vllm": 140 | outputs = llm.generate(prompts, params) 141 | elif args.engine == "hf": 142 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, 
truncation=True).to(torch.cuda.current_device()) 143 | gen_do_sample = False if args.temperature == 0 else True 144 | outputs = llm.generate(**inputs, 145 | tokenizer=tokenizer, 146 | do_sample=gen_do_sample, 147 | temperature=args.temperature if gen_do_sample else None, # To avoid temperature` (=0) has to be a strictly positive float 148 | top_p=args.top_p, 149 | repetition_penalty=args.repetition_penalty, 150 | max_length=args.max_tokens, 151 | ) 152 | outputs = tokenizer.batch_decode(outputs[i][len(inputs[i]):] for i in range(len(outputs))) 153 | # Setting stop tokens seems not working for Gemma, so we manually truncate the outputs 154 | for i, completion in enumerate(outputs): 155 | for stop_token in stop_tokens: 156 | if stop_token in completion: 157 | outputs[i] = completion[:completion.index(stop_token)] 158 | 159 | for i, item in enumerate(batch): 160 | if args.engine == "vllm": 161 | item['response'] = outputs[i].outputs[0].text.strip() 162 | elif args.engine == "hf": 163 | item['response'] = outputs[i].strip() 164 | item['gen_response_configs'] = { 165 | "prompt": prompts[i], 166 | "temperature": args.temperature, 167 | "top_p": args.top_p, 168 | "repetition_penalty": args.repetition_penalty, 169 | "max_tokens": args.max_tokens, 170 | "stop_tokens": stop_tokens, 171 | "output_generator": MODEL_NAME, 172 | "engine": args.engine, 173 | } 174 | return batch 175 | 176 | # Generate outputs, update dataset in batches, and overwrite checkpoint 177 | def generate_and_update(dataset, llm=None, params=None, tokenizer=None): 178 | # Initialize tokenizer 179 | if tokenizer is not None: 180 | if tokenizer.pad_token_id is None: 181 | tokenizer.pad_token = tokenizer.eos_token 182 | if "gemma-2" in args.model_path.lower(): 183 | tokenizer.padding_side = "right" 184 | 185 | # Intialize the dataset with the checkpoint file (if it exists) 186 | if os.path.exists(CHECKPOINT_FILE): 187 | last_checkpoint_idx = len(load_dataset_from_file(CHECKPOINT_FILE)) 188 | print(f"Checkpoint file found. 
Resuming from last checkpoint with index {last_checkpoint_idx}.") 189 | dataset[:last_checkpoint_idx] = load_dataset_from_file(CHECKPOINT_FILE) 190 | # Calculate total number of batches 191 | num_batches = (len(dataset) - last_checkpoint_idx + BATCH_SIZE - 1) // BATCH_SIZE 192 | 193 | print(f"Remaining number of batches: {num_batches}") 194 | else: 195 | last_checkpoint_idx = 0 196 | # Calculate total number of batches 197 | num_batches = (len(dataset) + BATCH_SIZE - 1) // BATCH_SIZE 198 | print(f"Total number of batches: {num_batches}") 199 | 200 | for i in tqdm(range(num_batches)): 201 | start_idx = i * BATCH_SIZE + last_checkpoint_idx 202 | end_idx = min((i + 1) * BATCH_SIZE + last_checkpoint_idx, len(dataset)) 203 | batch = dataset[start_idx:end_idx] 204 | if args.engine == "together": 205 | batch = process_batch_with_api(batch) 206 | else: 207 | batch = process_batch(batch, llm, params, tokenizer) 208 | 209 | dataset[start_idx:end_idx] = batch 210 | # Overwrite the same checkpoint file after serveral batches 211 | if i % CHECKPOINT_EVERY == 0: 212 | save_dataset(dataset[:end_idx], CHECKPOINT_FILE) 213 | print(f"Dataset checkpoint saved after batch {i + 1}.") 214 | 215 | return dataset 216 | 217 | # Main function to control workflow 218 | def main(): 219 | # Load instructions from the input file 220 | dataset = load_dataset_from_file(INPUT_FILE_NAME) 221 | 222 | if args.engine == "together": 223 | print("Start together API engine...") 224 | llm = None 225 | params = None 226 | tokenizer = None 227 | elif args.engine == "vllm": 228 | # Set the device 229 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 230 | print("Start Local vllm engine...") 231 | llm = LLM(model=MODEL_NAME, 232 | dtype=args.dtype, 233 | trust_remote_code=True, 234 | max_model_len = args.max_model_len, # limited by kv-cache 235 | tensor_parallel_size = args.tensor_parallel_size, 236 | gpu_memory_utilization = args.gpu_memory_utilization) 237 | 238 | params = SamplingParams( 239 | max_tokens=args.max_tokens, 240 | temperature=args.temperature, 241 | top_p=args.top_p, 242 | repetition_penalty=args.repetition_penalty, 243 | stop_token_ids=stop_token_ids, 244 | ) 245 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 246 | elif args.engine == "hf": 247 | print("Start Hugging Face engine...") 248 | params = None 249 | # Load the model and tokenizer 250 | llm = AutoModelForCausalLM.from_pretrained( 251 | args.model_path, 252 | device_map={'':torch.cuda.current_device()}, 253 | torch_dtype=torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 254 | ) 255 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 256 | else: 257 | raise ValueError("Invalid engine type.") 258 | 259 | updated_dataset = generate_and_update(dataset, llm, params, tokenizer=tokenizer) 260 | 261 | # Save final dataset 262 | save_dataset(updated_dataset, SAVED_FILE) 263 | 264 | # Optionally remove the checkpoint file after completion 265 | os.remove(CHECKPOINT_FILE) 266 | print("Final dataset saved. 
Checkpoint removed.") 267 | 268 | # Run the main function 269 | if __name__ == "__main__": 270 | main() -------------------------------------------------------------------------------- /exp/gen_po_multi_res.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import argparse 5 | import json 6 | import requests 7 | import concurrent.futures 8 | from time import sleep 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer 11 | from utils import load_dataset_from_file, save_dataset, make_api_request_with_retry, get_conversation_template 12 | from vllm import LLM, SamplingParams 13 | from transformers import AutoTokenizer, AutoModelForCausalLM 14 | 15 | ################ 16 | # Configurations 17 | ################ 18 | def get_args(): 19 | # Experiment Settings 20 | parser = argparse.ArgumentParser(description="Response Generation Manager.") 21 | parser.add_argument("--model_path", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct", 22 | help="We will support more models in the future.") 23 | parser.add_argument("--input_file", type=str, default=None, help="Input dataset file name") 24 | parser.add_argument("--batch_size", type=int, default=128, help="Number of samples per batch") 25 | parser.add_argument("--checkpoint_every", type=int, default=20, help="Save checkpoint every n batches") 26 | parser.add_argument("--api_url", type=str, default="https://api.together.xyz/v1/chat/completions", help="API URL") 27 | parser.add_argument("--api_key", type=str, default=None, help="Together API Key") 28 | parser.add_argument("--offline", action="store_true", help="Use local engine") 29 | 30 | # Generation Parameters 31 | parser.add_argument('--engine', default="vllm", type=str, choices=["vllm", "hf", "together"]) 32 | parser.add_argument("--device", type=str, default="0") 33 | parser.add_argument("--dtype", type=str, default="bfloat16", choices=["float16", "bfloat16"]) 34 | parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Number of GPUs to use for tensor parallelism. Only used for Llama 70B models.") 35 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.95) 36 | parser.add_argument("--max_tokens", type=int, default=4096) 37 | parser.add_argument("--max_model_len", type=int, default=4096) 38 | parser.add_argument("--temperature", type=float, default=0.8) 39 | parser.add_argument("--top_p", type=float, default=1.0) 40 | parser.add_argument("--repetition_penalty", type=float, default=1.0) 41 | parser.add_argument("--num_samples", type=int, default=5) 42 | parser.add_argument("--tokenizer_template", type=bool, default=False, help="Use tokenizer template for generating the response.") 43 | parser.add_argument("--use_tokenizer_template", action="store_true", dest="tokenizer_template") 44 | 45 | return parser.parse_args() 46 | 47 | args = get_args() 48 | print(f"Response Generation Manager. 
Arguments: {args}") # For logging 49 | 50 | if args.input_file is None: 51 | raise ValueError("Please specify the input file path.") 52 | 53 | # Constants for the local vllm engine 54 | MODEL_NAME = args.model_path 55 | INPUT_FILE_NAME = args.input_file 56 | BATCH_SIZE = args.batch_size 57 | CHECKPOINT_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_{args.num_samples}res_checkpoint.json" 58 | CHECKPOINT_EVERY = args.checkpoint_every 59 | SAVED_FILE = f"{INPUT_FILE_NAME[:INPUT_FILE_NAME.rfind('.')]}_{args.num_samples}res.json" 60 | 61 | # Obtain config from configs/model_configs.json 62 | with open("../configs/model_configs.json", "r") as f: 63 | model_configs = json.load(f) 64 | model_config = model_configs[args.model_path] 65 | stop_tokens = model_config["stop_tokens"] 66 | stop_token_ids = model_config["stop_token_ids"] 67 | 68 | # API Setups 69 | if args.engine == "together": 70 | # Change name for API (Together Naming Convention) 71 | if MODEL_NAME == "meta-llama/Meta-Llama-3-8B-Instruct": 72 | api_model_name = "meta-llama/Llama-3-8b-chat-hf" 73 | elif MODEL_NAME == "meta-llama/Meta-Llama-3-70B-Instruct": 74 | api_model_name = "meta-llama/Llama-3-70b-chat-hf" 75 | else: 76 | api_model_name = MODEL_NAME 77 | 78 | # Constants for the API 79 | API_ENDPOINT = args.api_url 80 | API_HEADERS = { 81 | "Authorization": args.api_key, 82 | } 83 | API_PARAMS = { 84 | "model": api_model_name, 85 | "max_tokens": args.max_tokens, 86 | "temperature": args.temperature, 87 | "top_p": args.top_p, 88 | "repetition_penalty": args.repetition_penalty, 89 | "stop": stop_tokens 90 | } 91 | 92 | # Process a batch of data using the API 93 | def process_batch_with_api(batch): 94 | # with concurrent.futures.ThreadPoolExecutor() as executor: 95 | with concurrent.futures.ProcessPoolExecutor() as executor: 96 | future_to_item = { 97 | executor.submit( 98 | make_api_request_with_retry, 99 | [{'content': item['instruction'], 'role': 'user'}], 100 | API_PARAMS, 101 | API_ENDPOINT, 102 | API_HEADERS, 103 | ): item 104 | for item in batch 105 | } 106 | 107 | for future in concurrent.futures.as_completed(future_to_item): 108 | item = future_to_item[future] 109 | try: 110 | api_responses = [] 111 | for _ in range(args.num_samples): # Generate multiple samples 112 | api_response = future.result() 113 | api_responses.append(api_response.strip()) 114 | item['responses'] = api_responses 115 | item['gen_response_configs'] = { 116 | "temperature": args.temperature, 117 | "top_p": args.top_p, 118 | "repetition_penalty": args.repetition_penalty, 119 | "max_tokens": args.max_tokens, 120 | "stop_tokens": stop_tokens, 121 | "output_generator": MODEL_NAME, 122 | "engine": api_model_name, 123 | } 124 | except Exception as e: 125 | print(f"Failed to process item: {item} with error: {str(e)}") 126 | item['responses'] = [] 127 | 128 | return batch 129 | 130 | # Process a batch of data using local vllm engine 131 | def process_batch(batch, llm, params, tokenizer=None): 132 | user_instructions = [item['instruction'] for item in batch] 133 | prompts = [] 134 | for instruction in user_instructions: 135 | if not args.tokenizer_template: 136 | conv = get_conversation_template(MODEL_NAME) 137 | conv.append_message(conv.roles[0], instruction) 138 | conv.append_message(conv.roles[1], None) 139 | template = conv.get_prompt() 140 | else: 141 | chat = [{"role": "user", "content": instruction}] 142 | template = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) 143 | prompts.append(template) 144 | 145 | all_outputs 
= [] 146 | if args.engine == "vllm": 147 | for _ in range(args.num_samples): # Generate multiple samples 148 | outputs = llm.generate(prompts, params) 149 | all_outputs.append(outputs) 150 | elif args.engine == "hf": 151 | for _ in range(args.num_samples): # Generate multiple samples 152 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(torch.cuda.current_device()) 153 | gen_do_sample = False if args.temperature == 0 else True 154 | outputs = llm.generate(**inputs, 155 | tokenizer=tokenizer, 156 | do_sample=gen_do_sample, 157 | temperature=args.temperature if gen_do_sample else None, 158 | top_p=args.top_p, 159 | repetition_penalty=args.repetition_penalty, 160 | max_length=args.max_tokens, 161 | ) 162 | outputs = tokenizer.batch_decode(outputs[i][len(inputs[i]):] for i in range(len(outputs))) 163 | # Setting stop tokens seems not working for Gemma, so we manually truncate the outputs 164 | for i, completion in enumerate(outputs): 165 | for stop_token in stop_tokens: 166 | if stop_token in completion: 167 | outputs[i] = completion[:completion.index(stop_token)] 168 | all_outputs.append(outputs) 169 | 170 | for i, item in enumerate(batch): 171 | item['responses'] = [] 172 | if args.engine == "vllm": 173 | for output in all_outputs: 174 | item['responses'].append(output[i].outputs[0].text.strip()) 175 | elif args.engine == "hf": 176 | for output in all_outputs: 177 | item['responses'].append(output[i].strip()) 178 | item['gen_response_configs'] = { 179 | "prompt": prompts[i], 180 | "temperature": args.temperature, 181 | "top_p": args.top_p, 182 | "repetition_penalty": args.repetition_penalty, 183 | "max_tokens": args.max_tokens, 184 | "stop_tokens": stop_tokens, 185 | "output_generator": MODEL_NAME, 186 | "engine": args.engine, 187 | } 188 | return batch 189 | 190 | # Generate outputs, update dataset in batches, and overwrite checkpoint 191 | def generate_and_update(dataset, llm=None, params=None, tokenizer=None): 192 | # Initialize tokenizer 193 | if tokenizer is not None: 194 | if tokenizer.pad_token_id is None: 195 | tokenizer.pad_token = tokenizer.eos_token 196 | if "gemma-2" in args.model_path.lower(): 197 | tokenizer.padding_side = "right" 198 | 199 | # Intialize the dataset with the checkpoint file (if it exists) 200 | if os.path.exists(CHECKPOINT_FILE): 201 | last_checkpoint_idx = len(load_dataset_from_file(CHECKPOINT_FILE)) 202 | print(f"Checkpoint file found. 
Resuming from last checkpoint with index {last_checkpoint_idx}.") 203 | dataset[:last_checkpoint_idx] = load_dataset_from_file(CHECKPOINT_FILE) 204 | # Calculate total number of batches 205 | num_batches = (len(dataset) - last_checkpoint_idx + BATCH_SIZE - 1) // BATCH_SIZE 206 | 207 | print(f"Remaining number of batches: {num_batches}") 208 | else: 209 | last_checkpoint_idx = 0 210 | # Calculate total number of batches 211 | num_batches = (len(dataset) + BATCH_SIZE - 1) // BATCH_SIZE 212 | print(f"Total number of batches: {num_batches}") 213 | 214 | for i in tqdm(range(num_batches)): 215 | start_idx = i * BATCH_SIZE + last_checkpoint_idx 216 | end_idx = min((i + 1) * BATCH_SIZE + last_checkpoint_idx, len(dataset)) 217 | batch = dataset[start_idx:end_idx] 218 | if args.engine == "together": 219 | batch = process_batch_with_api(batch) 220 | else: 221 | batch = process_batch(batch, llm, params, tokenizer) 222 | 223 | dataset[start_idx:end_idx] = batch 224 | # Overwrite the same checkpoint file after serveral batches 225 | if i % CHECKPOINT_EVERY == 0: 226 | save_dataset(dataset[:end_idx], CHECKPOINT_FILE) 227 | print(f"Dataset checkpoint saved after batch {i + 1}.") 228 | 229 | return dataset 230 | 231 | # Main function to control workflow 232 | def main(): 233 | # Load instructions from the input file 234 | dataset = load_dataset_from_file(INPUT_FILE_NAME) 235 | 236 | if args.engine == "together": 237 | print("Start together API engine...") 238 | llm = None 239 | params = None 240 | tokenizer = None 241 | elif args.engine == "vllm": 242 | # Set the device 243 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 244 | print("Start Local vllm engine...") 245 | llm = LLM(model=MODEL_NAME, 246 | dtype=args.dtype, 247 | trust_remote_code=True, 248 | max_model_len = args.max_model_len, # limited by kv-cache 249 | tensor_parallel_size = args.tensor_parallel_size, 250 | gpu_memory_utilization = args.gpu_memory_utilization) 251 | 252 | params = SamplingParams( 253 | max_tokens=args.max_tokens, 254 | temperature=args.temperature, 255 | top_p=args.top_p, 256 | repetition_penalty=args.repetition_penalty, 257 | stop_token_ids=stop_token_ids, 258 | ) 259 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 260 | elif args.engine == "hf": 261 | print("Start Hugging Face engine...") 262 | params = None 263 | # Load the model and tokenizer 264 | llm = AutoModelForCausalLM.from_pretrained( 265 | args.model_path, 266 | device_map={'':torch.cuda.current_device()}, 267 | torch_dtype=torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 268 | ) 269 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 270 | else: 271 | raise ValueError("Invalid engine type.") 272 | 273 | updated_dataset = generate_and_update(dataset, llm, params, tokenizer=tokenizer) 274 | 275 | # Save final dataset 276 | save_dataset(updated_dataset, SAVED_FILE) 277 | 278 | # Optionally remove the checkpoint file after completion 279 | os.remove(CHECKPOINT_FILE) 280 | print("Final dataset saved. 
Checkpoint removed.") 281 | 282 | # Run the main function 283 | if __name__ == "__main__": 284 | main() -------------------------------------------------------------------------------- /exp/gen_ins.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import argparse 5 | import json 6 | import time 7 | import random 8 | import numpy as np 9 | from tqdm import tqdm 10 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 11 | from vllm import LLM, SamplingParams 12 | from transformers import AutoTokenizer, AutoModelForCausalLM 13 | from str_utils import de_md_logits_processor_for_llama3_1, flaming_tokens 14 | import str_utils 15 | 16 | ################ 17 | # Configurations 18 | ################ 19 | def get_args(): 20 | # Experiment Settings 21 | parser = argparse.ArgumentParser(description="Instruction Generation Manager.") 22 | parser.add_argument("--model_path", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct", 23 | help="We will support more models in the future.") 24 | 25 | # Generation Parameters 26 | parser.add_argument("--temperature", type=float, default=1.0) 27 | parser.add_argument("--top_p", type=float, default=1.0) 28 | parser.add_argument("--n", type=int, default=200, help="Number of samples to generate for one time.") 29 | parser.add_argument("--repeat", type=int, default=None, help="Number of times to repeat the instruction generation. Only available when total prompts is not specified.") 30 | parser.add_argument("--total_prompts", type=int, default=1000, help="Total number of prompts to generate. If specified, repeat will be ignored.") 31 | parser.add_argument("--max_tokens", type=int, default=2048) 32 | parser.add_argument("--max_model_len", type=int, default=4096) 33 | 34 | # Generation Settings 35 | parser.add_argument("--early_stopping", type=bool, default=True, help="Stop generation when the \n is generated.") 36 | parser.add_argument("--disable_early_stopping", action="store_false", dest="early_stopping", help="Disable early stopping.") 37 | parser.add_argument("--system_prompt", action="store_true", help="Enable system prompt for extracting the input.") 38 | parser.add_argument("--sanitize", action="store_true", help="Sanitize the generated instructions. Only available for Gemma and Llama-3 models.") 39 | parser.add_argument("--logits_processor", action="store_true", help="Enable logits processor for the generation.") 40 | parser.add_argument("--flaming_tokens", action="store_true", help="Enable flaming initial tokens (increase temperature) for more diverse generation.") 41 | parser.add_argument("--control_tasks", type=str, default=None, choices=[None, "translation", "code", "math"], help="Control tasks for the generation. Currently only available for some models.") 42 | parser.add_argument("--shuffle", type=bool, default=True, help="Shuffle the outputs generated by vllm.") 43 | parser.add_argument("--skip_special_tokens", type=bool, default=True) 44 | 45 | # System Settings 46 | parser.add_argument('--engine', default="vllm", type=str, choices=["vllm", "hf"]) 47 | parser.add_argument("--device", type=str, default="0") 48 | parser.add_argument("--dtype", type=str, default="bfloat16", choices=["float16", "bfloat16"]) 49 | parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Number of GPUs to use for tensor parallelism. 
Only used for Llama 70B models.") 50 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.95) 51 | parser.add_argument("--swap_space", type=float, default=2.0) 52 | parser.add_argument("--checkpoint_every", type=int, default=100, help="Save checkpoint every n repeats.") 53 | parser.add_argument("--output_folder", type=str, default="../data") 54 | parser.add_argument("--job_name", type=str, default=None, help="Job Name. Get from the script.") 55 | parser.add_argument("--timestamp", type=int, default=int(time.time()), help="Timestamp for the job. Also used as the random seed.") 56 | parser.add_argument("--seed", type=int, default=None, help="Random seed.") 57 | 58 | return parser.parse_args() 59 | 60 | # Main function to control workflow 61 | def main(): 62 | args = get_args() 63 | print(f"Instruction Generation Manager. Arguments: {args}") # For logging 64 | 65 | # Raise error if sanitization is requested for unsupported models 66 | if args.sanitize: 67 | if not ("gemma" in args.model_path.lower() or "llama-3" in args.model_path.lower()): 68 | raise ValueError("Sanitization is only supported for Gemma and Llama-3 models.") 69 | 70 | if args.total_prompts is None: 71 | if args.repeat is None: 72 | raise ValueError("Either total prompts or repeat should be specified.") 73 | args.total_prompts = args.repeat * args.n 74 | else: 75 | # If total prompts is specified, repeat will be ignored 76 | args.repeat = int(np.ceil(args.total_prompts / args.n)) 77 | 78 | # Set the random seed for NumPy 79 | if args.seed is not None: 80 | np.random.seed(args.seed) 81 | # Set the random seed for PyTorch 82 | torch.manual_seed(args.seed) 83 | # If you are using CUDA (i.e., a GPU), also set the seed for it 84 | torch.cuda.manual_seed_all(args.seed) 85 | 86 | # Create output file / folder 87 | output_filename = f"Magpie_{args.model_path.split('/')[-1]}_{args.total_prompts}_{args.timestamp}_ins.json" 88 | if not args.job_name: 89 | if not os.path.exists(args.output_folder): 90 | os.makedirs(args.output_folder) 91 | output_dir = f"{args.output_folder}/{output_filename}" 92 | else: 93 | output_dir = f"{args.output_folder}/{args.job_name}/{output_filename}" 94 | 95 | # Set the device 96 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 97 | # Set generation engine 98 | if args.engine == "vllm": 99 | # Create vllm instance 100 | llm = LLM(model=args.model_path, 101 | dtype=args.dtype, 102 | trust_remote_code=True, 103 | gpu_memory_utilization=args.gpu_memory_utilization, 104 | max_model_len=args.max_model_len, 105 | swap_space=args.swap_space, 106 | tensor_parallel_size=args.tensor_parallel_size, 107 | seed=args.seed if args.seed is not None else args.timestamp, 108 | enable_prefix_caching=True) 109 | elif args.engine == "hf": 110 | # Load the model and tokenizer 111 | tokenizer = AutoTokenizer.from_pretrained(args.model_path) 112 | model = AutoModelForCausalLM.from_pretrained( 113 | args.model_path, 114 | device_map={'':torch.cuda.current_device()}, 115 | torch_dtype=torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 116 | ) 117 | 118 | 119 | # Obtain config from configs/model_configs.json 120 | with open("../configs/model_configs.json", "r", encoding="utf-8") as f: 121 | model_configs = json.load(f) 122 | model_config = model_configs[args.model_path] 123 | if args.control_tasks: 124 | pre_query_template = model_config[f"pre_query_template_{args.control_tasks}"] 125 | print("Control task: {args.control_tasks}") 126 | elif args.system_prompt: 127 | pre_query_template = 
model_config["pre_query_template_with_system_prompt"] 128 | print("System prompt enabled. Warning: The system prompt may degrade the performance.") 129 | else: 130 | pre_query_template = model_config["pre_query_template"] 131 | stop_tokens = model_config["stop_tokens"] 132 | stop_tokens_assistant = model_config["stop_tokens_assistant"] 133 | stop_tokens += stop_tokens_assistant 134 | stop_token_ids = model_config["stop_token_ids"] 135 | 136 | # Process early stopping. We found that sometimes LLM will generate responses immediately after the \n token. 137 | if args.early_stopping: 138 | stop_tokens.append("\n") 139 | 140 | print(f"Pre-query template: {pre_query_template}") 141 | print(f"Stop tokens: {stop_tokens}") 142 | print(f"Stop token ids: {stop_token_ids}") 143 | 144 | # Apply logits processors 145 | if args.logits_processor and args.flaming_tokens: 146 | raise ValueError("Cannot enable both logits processor and flaming tokens") 147 | 148 | if args.logits_processor and "llama-3.1" in args.model_path.lower(): 149 | logits_processor = de_md_logits_processor_for_llama3_1 150 | print(f"Logits processor applied: {logits_processor}") 151 | elif args.flaming_tokens: 152 | logits_processor = flaming_tokens 153 | print(f"Logits processor applied: {logits_processor}") 154 | else: 155 | logits_processor = None 156 | 157 | # Define sampling parameters 158 | sampling_params = SamplingParams( 159 | n=args.n, 160 | temperature=args.temperature, 161 | top_p=args.top_p, 162 | max_tokens=args.max_tokens, 163 | skip_special_tokens=args.skip_special_tokens, 164 | stop=stop_tokens, 165 | stop_token_ids=stop_token_ids, 166 | logits_processors=[logits_processor] if logits_processor else None 167 | ) 168 | 169 | ################ 170 | # Generate outputs 171 | ################ 172 | results = [] 173 | for rounds in tqdm(range(args.repeat)): 174 | # Generate outputs 175 | if args.engine == "vllm": 176 | output = llm.generate(pre_query_template, sampling_params) 177 | output_list = output[0].outputs 178 | if args.shuffle: 179 | random.shuffle(output_list) 180 | 181 | elif args.engine == "hf": 182 | input = tokenizer.encode(pre_query_template, add_special_tokens=False, return_tensors="pt").to(torch.cuda.current_device()) 183 | # Gemma-2 bug, so we cannot set num_return_sequences > 1. 184 | # Instead, we repeat the input n times. 
185 | inputs = input.repeat(args.n, 1).to(torch.cuda.current_device()) 186 | output = model.generate(inputs, 187 | tokenizer=tokenizer, 188 | do_sample=True, 189 | temperature=args.temperature, 190 | top_p=args.top_p, 191 | max_length=args.max_tokens, 192 | num_return_sequences=1, 193 | ) 194 | # Remove the input from the output 195 | output_list = tokenizer.batch_decode(output[i][len(inputs[0]):] for i in range(args.n)) 196 | # Stop on the first stop token 197 | for i, completion in enumerate(output_list): 198 | for stop_token in stop_tokens: 199 | if stop_token in completion: 200 | output_list[i] = completion[:completion.index(stop_token)] 201 | 202 | # Save outputs 203 | for i, completion in enumerate(output_list): 204 | if args.engine == "vllm": 205 | instruction = completion.text.strip() 206 | elif args.engine == "hf": 207 | instruction = completion.strip() 208 | 209 | if args.sanitize: 210 | sanitized_instruction, class_num = str_utils.instruction_post_process(instruction, args.model_path) 211 | result = { 212 | "id": rounds * args.n + i, 213 | "pre_query_template": f"{pre_query_template}", 214 | "raw_instruction": instruction, 215 | "instruction": sanitized_instruction, 216 | "instruction_sanitize_class_num": class_num, 217 | "response": None, 218 | "created": int(time.time()), 219 | "gen_input_configs": { 220 | "temperature": args.temperature, 221 | "top_p": args.top_p, 222 | "input_generator": f"{args.model_path}", 223 | "seed": args.seed, 224 | }, 225 | "gen_response_configs": None, 226 | } 227 | else: 228 | result = { 229 | "id": rounds * args.n + i, 230 | "pre_query_template": f"{pre_query_template}", 231 | "instruction": instruction, 232 | "response": None, 233 | "created": int(time.time()), 234 | "gen_input_configs": { 235 | "temperature": args.temperature, 236 | "top_p": args.top_p, 237 | "input_generator": f"{args.model_path}", 238 | "seed": args.seed, 239 | }, 240 | "gen_response_configs": None, 241 | } 242 | results.append(result) 243 | 244 | # Save the checkpoints every args.checkpoint_every rounds 245 | if rounds % args.checkpoint_every == 0: 246 | with open(output_dir, "w") as f: 247 | json.dump(results, f, indent=2) 248 | print(f"Checkpoint saved. Total prompts: {len(results)}") 249 | 250 | # Save the final results 251 | with open(output_dir, "w") as f: 252 | json.dump(results, f, indent=2) 253 | 254 | print(f"Instruction generated from {args.model_path}. Total prompts: {len(results)}") 255 | 256 | # Run the main function 257 | if __name__ == "__main__": 258 | main() 259 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![Magpie](figs/magpie_logo.png)](https://magpie-align.github.io/) 4 | 5 | [![arXiv](https://img.shields.io/badge/arXiv-paper-b31b1b.svg)](https://arxiv.org/abs/2406.08464) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Data License](https://img.shields.io/badge/Data%20License-CC%20By%20NC%204.0-red.svg)](https://huggingface.co/Magpie-Align) [![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/davanstrien/magpie) 6 | 7 | This is the official repository for ICLR 2025 paper "[Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)". Magpie generates high-quality alignment data by prompting aligned LLMs with their pre-query templates. 
Unlike many existing synthetic data generation methods, Magpie doesn't rely on prompt engineering or seed questions for generating synthetic data. Instead, it uses the prompt template of an aligned LLM to generate both the user query and an LLM response. 8 | 9 | - 🤗 [**Huggingface (Models and Datasets)**](https://huggingface.co/Magpie-Align) 10 | - 🧭 [**Dataset Navigation**](navigation.md) 11 | - 🕸️ [**Website**](https://magpie-align.github.io/) 12 | - 📄 [**Technical Report**](https://arxiv.org/abs/2406.08464) 13 | - 🤗 [**Magpie Demo**](https://huggingface.co/spaces/davanstrien/magpie) (Thanks a lot to @davanstrien for the implementation!) 14 | - 🐦 [**Chat with Magpie**](https://huggingface.co/spaces/flydust/Chat-with-Magpie) 15 | 16 | ## 🐦 News 17 | - [2025/01/22] Magpie paper is accepted by ICLR 2025! 18 | - [2025/01/09] Magpie Reasoning V2 dataset is out! [250K](https://huggingface.co/collections/Magpie-Align/magpie-reasoning-datasets-67790a13b91035bc42693885) from Llama, Skywork-o1 and QwQ! This time, we focus on CoT 🤯 19 | - [2025/01/01] Magpie Llama-3.3 dataset is out! [1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.3-Pro-1M-v0.1) from Llama-3.3-70B-Instruct! Happy New Year! 20 | - [2024/10/20] Magpie Qwen2.5 dataset is out! [1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1) from Qwen2.5 72B! 21 | - [2024/09/17] Ship two new models with SOTA performance: 𝙼𝚊𝚐𝚙𝚒𝚎𝙻𝙼-𝙲𝚑𝚊𝚝 (4B & 8B)! See collection [here](https://huggingface.co/collections/Magpie-Align/magpielm-66e2221f31fa3bf05b10786a)! 22 | - [2024/08/19] Three preference optimization datasets, [Magpie-Air-DPO-100K-v0.1](https://huggingface.co/datasets/Magpie-Align/Magpie-Air-DPO-100K-v0.1), [Magpie-Pro-DPO-100K-v0.1](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-DPO-100K-v0.1), and [Magpie-Llama-3.1-Pro-DPO-100K-v0.1](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-DPO-100K-v0.1) are out! 23 | - [2024/07/25] Magpie Llama-3.1 dataset is out! [1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-1M-v0.1) from Llama-3.1-70B-Instruct! A friendlier license compared with Llama-3 😃! 24 | - [2024/07/21] Magpie Gemma2 dataset is out! [534K](https://huggingface.co/collections/Magpie-Align/magpie-gemma2-datasets-669da6aff21b09fdcecbd6ea) from Gemma-2-27b-it! 25 | - [2024/07/19] [Llama-3-8B-Magpie-Align-v0.3](https://huggingface.co/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3) is out with enhanced Chinese question-answering ability, thanks to our new [Chinese instruction dataset](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese)! 26 | - [2024/07/14] [Llama-3-8B-Magpie-Align-v0.2](https://huggingface.co/Magpie-Align/Llama-3-8B-Magpie-Align-v0.2) is out with enhanced reasoning ability, thanks to our new [reasoning booster dataset](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K)! 27 | - [2024/07/04] Magpie Qwen2 dataset is out! [1M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-1M-v0.1) from Qwen2 72B and [3M](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Air-3M-v0.1) from Qwen2 7B. 28 | - [2024/07/03] 🏆 Our open aligned model, [Llama-3-8B-Magpie-Align-v0.1](https://huggingface.co/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1) is out! It is 🏆 the **best <30B Model** in [AI2 WildBench Leaderboard](https://huggingface.co/spaces/allenai/WildBench)! Even better than the official [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) model!
29 | - [2024/06/24] Magpie Phi 3 dataset is out! [1M](https://huggingface.co/collections/Magpie-Align/magpie-phi3-667a7a45f1a406cd61685d64) from Phi 3 Medium. 30 | - [2024/06/12] Magpie Llama-3 dataset is out! [1M](https://huggingface.co/collections/Magpie-Align/magpie-pro-6666b0e713e5f5c09554876f) from Llama-3 70B and [3M](https://huggingface.co/collections/Magpie-Align/magpie-air-6666b11a32021655a27f86c0) from Llama-3 8B. 31 | - [2024/06/12] [Magpie technical report](https://arxiv.org/abs/2406.08464) is out! Let's make high-quality alignment data open for all! 32 | 33 | ## Magpie Supports 34 | 35 | Currently, Magpie has been tested on the **Llama-3**, **Qwen2**, **Phi 3** and **Gemma-2** series. Please [submit an issue](https://github.com/magpie-align/magpie/issues/new) for more model support. 36 | 37 | |Model Family | Magpie | Magpie Scripts | Datasets | Size | 38 | |-------------|:------:|:-------|:-------|:-------| 39 | | [Llama 3.3](https://huggingface.co/collections/meta-llama/llama-33-67531d5c405ec5d08a852000) | ✅ | [70B](scripts/magpie-llama3.3-70b.sh) | [70B](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.3-Pro-1M-v0.1) | 1M | 40 | | [Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) | ✅ * | [8B](scripts/magpie-llama3.1-8b.sh),[70B](scripts/magpie-llama3.1-70b.sh) | [70B](https://huggingface.co/collections/Magpie-Align/magpie-llama31-datasets-66a45ed727be07f53c8ff294),[405B(Argilla)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1) | 1M | 41 | | [Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6) | ✅ | [8B](scripts/magpie-llama3-8b.sh),[70B](scripts/magpie-llama3-70b.sh) | [8B](https://huggingface.co/collections/Magpie-Align/magpie-air-6666b11a32021655a27f86c0),[70B](https://huggingface.co/collections/Magpie-Align/magpie-pro-6666b0e713e5f5c09554876f) | 3M + 1M | 42 | | [Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e) | ✅ | [3B](scripts/magpie-qwen2.5-3b.sh),[7B](scripts/magpie-qwen2.5-7b.sh),[14B](scripts/magpie-qwen2.5-14b.sh),[32B](scripts/magpie-qwen2.5-32b.sh),[72B](scripts/magpie-qwen2.5-72b.sh) | [72B](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1) | 1M | 43 | | [Qwen2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) | ✅ | [7B](scripts/magpie-qwen2-7b.sh),[72B](scripts/magpie-qwen2-72b.sh),[Math 7B](scripts/magpie-qwen2-math-7b.sh) | [7B](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Air-3M-v0.1),[72B](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-1M-v0.1) | 3M + 1M | 44 | | [Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) | ✅ | [mini](scripts/magpie-phi3mini.sh),[small](scripts/magpie-phi3small.sh),[medium](scripts/magpie-phi3medium.sh) | [medium](https://huggingface.co/collections/Magpie-Align/magpie-phi3-667a7a45f1a406cd61685d64) | 1M | 45 | | [Gemma-2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) | ✅ ** | [9B](scripts/magpie-gemma2-9b.sh),[27B](scripts/magpie-gemma2-27b.sh) | [27B](https://huggingface.co/collections/Magpie-Align/magpie-gemma2-datasets-669da6aff21b09fdcecbd6ea) | 534K | 46 | | [Gemma-1.1](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | ⭕️ | [7B](scripts/magpie-gemma7b.sh) 47 | | [Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) | ⭕️ |
[7B](scripts/magpie-llama2-7b.sh),[70B](scripts/magpie-llama2-70b.sh) 48 | | [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) | ⭕️ | [7B](scripts/magpie-vicuna-7b.sh) 49 | | [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | ⭕️ | [7B](scripts/magpie-mistral7b.sh) 50 | | [Yi](https://huggingface.co/collections/01-ai/yi-15-2024-05-663f3ecab5f815a3eaca7ca8) | ⭕️ | [34B](scripts/magpie-yi34b.sh) 51 | | [DeepSeek Coder](https://huggingface.co/collections/deepseek-ai/deepseekcoder-v2-666bf4b274a5f556827ceeca) | ⭕️ | [Coder V2 Lite](https://github.com/magpie-align/magpie/blob/main/scripts/magpie-deepseek-coderv2-lite.sh) 52 | 53 | - ✅: It works great! (**\*** Apply a logits processor to suppress markdown; **\*\*** Apply a [filter](exp/str_utils.py) before generating responses.) 54 | - ⭕️: It works! We can get something interesting, but we may need to design an additional logits processor and/or a filter. 55 | - ❌: Does not work. 56 | - ❓: Untested. 57 | 58 | The navigation of all available Magpie datasets can be found [here](navigation.md). 59 | 60 | We hope Magpie can contribute to the democratization of AI with enhanced transparency of model alignment processes! 61 | 62 | ## Abstract 63 | <details><summary>
Click Here</summary> 64 | High-quality instruction data is critical for aligning large language models (LLMs). Although some models, such as Llama-3-Instruct, have open weights, their alignment data remain private, which hinders the democratization of AI. High human labor costs and a limited, predefined scope for prompting prevent existing open-source data creation methods from scaling effectively, potentially limiting the diversity and quality of public alignment datasets. Is it possible to synthesize high-quality instruction data at scale by extracting it directly from an aligned LLM? We present a self-synthesis method for generating large-scale alignment data named Magpie. Our key observation is that aligned LLMs like Llama-3-Instruct can generate a user query when we input only the left-side templates up to the position reserved for user messages, thanks to their auto-regressive nature. We use this method to prompt Llama-3-Instruct and generate 4 million instructions along with their corresponding responses. We perform a comprehensive analysis of the extracted data and select 300K high-quality instances. To compare Magpie data with other public instruction datasets, we fine-tune Llama-3-8B-Base with each dataset and evaluate the performance of the fine-tuned models. Our results indicate that in some tasks, models fine-tuned with Magpie perform comparably to the official Llama-3-8B-Instruct, despite the latter being enhanced with 10 million data points through supervised fine-tuning (SFT) and subsequent feedback learning. We also show that using Magpie solely for SFT can surpass the performance of previous public datasets utilized for both SFT and preference optimization, such as direct preference optimization with UltraFeedback. This advantage is evident on alignment benchmarks such as AlpacaEval, ArenaHard, and WildBench. 65 | </details>
66 | 67 | ## Overview 68 | 69 | ![Overview](figs/overview.png) 70 | 71 | ## Installation 72 | 73 | **Build environment** 74 | ``` 75 | git clone https://github.com/magpie-align/magpie.git 76 | cd magpie 77 | conda create -n magpie python=3.10 -y 78 | conda activate magpie 79 | pip install -r requirements.txt 80 | ``` 81 | 82 | **Get access to Llama-3 models from 🤗 Huggingface** 83 | 84 | You can apply for Llama-3 model access [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). To log in from the terminal, enter: 85 | ``` 86 | huggingface-cli login 87 | ``` 88 | then enter your Huggingface private key beginning with "hf_". 89 | 90 | ## Toy Example 91 | 92 | **Play with Jupyter Notebook** 93 | 94 | The toy example can be found in [`demo.ipynb`](demo.ipynb). Have fun! 95 | 96 | 97 | Open In Colab 98 | 99 | 100 | ## Batched SFT Data Generation 101 | We use Llama-3-8B-Instruct as an example to demonstrate the batched SFT data generation process. To run batched generation, you can simply run: 102 | ``` 103 | cd scripts 104 | bash magpie.sh 105 | ``` 106 | The script will generate both instructions and responses in the data folder. It has been tested on an RTX 4090 24G GPU. If you are using GPUs with less memory, consider implementing [quantization](https://docs.vllm.ai/en/latest/quantization/fp8.html). 107 | 108 | We also provide scripts for other models in the [`scripts`](scripts) folder. You can use [this](#magpie-supports) navigation to find specific Magpie scripts. Note that for model sizes greater than 8B, you may need 4*A100 GPUs to run the scripts. 109 | 110 | ### Batched Multi-turn Data Generation \[Optional\] 111 | After generating instruction-response pairs, you can extend them to multi-turn conversations. To do so, simply run the following command: 112 | ``` 113 | bash magpie-multi-turn.sh ***_ins_res.json 114 | ``` 115 | where `***_ins_res.json` contains the single-turn instruction-response pairs generated in the previous step. 116 | 117 | ## Dataset Filtering 118 | ### 1. Tagging 119 | To tag the generated instruction-response pairs, you can run: 120 | ``` 121 | cd scripts 122 | bash unitag.sh ***_ins_res.json all 123 | ``` 124 | This script will automatically generate quality, difficulty, task category, safety, reward, and language tags for the generated dataset. You can also generate one tag at a time. For example, if you just want to generate the safety label using device 0, you can run: 125 | ``` 126 | cd scripts 127 | bash unitag.sh ***_ins_res.json safety 0 128 | ``` 129 | ### 2. Data Concatenation and Converting 130 | You may generate datasets with different generation configurations. We provide a Jupyter notebook [here](data_sft/data_concatenation.ipynb) for concatenating all datasets and converting them to ShareGPT format, which is fully supported by [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) for fine-tuning. 131 | 132 | ### 3. Removing Repetition 133 | Once you have a full dataset converted to ShareGPT format, you can calculate the minimum neighbor distance of each instruction and remove repetitions. To do so, run: 134 | ``` 135 | cd exp 136 | python gen_dis.py --input_file ***_sharegpt.jsonl 137 | ``` 138 | where `***_sharegpt.jsonl` is the dataset path obtained in the previous step. The Python script will take care of building the FAISS index and calculating the minimum distance. 139 | 140 | ### 4. Design and Apply Your Filter 141 | We provide a Jupyter notebook [here](data_sft/data_filter.ipynb) for simple filtering.
You can adjust the filtering parameters to design and apply your own filter based on your needs. 142 | 143 | ## Preference Data Generation 144 | 145 | To generate preference data, first prepare filtered instructions following the steps outlined above. For the expected format, please refer to our example [here](data_po/example_instructions.jsonl). 146 | 147 | Next, please use our provided scripts [here](scripts/magpie_example_po.sh) to generate multiple responses and compute their corresponding rewards. Finally, you can process the data and upload it to Huggingface using [this Jupyter notebook](data_po/process_po.ipynb). 148 | 149 | ## Fine-tuning 150 | Please take a look at the [recipes](recipes/) directory for instructions and our Magpie model recipes. 151 | 152 | ## Citation 153 | 154 | If you find the model, data, or code useful, please cite our paper 🤩: 155 | ``` 156 | @article{xu2024magpie, 157 | title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing}, 158 | author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin}, 159 | journal={ArXiv}, 160 | year={2024}, 161 | volume={abs/2406.08464}, 162 | url={https://api.semanticscholar.org/CorpusID:270391432} 163 | } 164 | ``` 165 | 166 | ## Star History 167 | 168 | [![Star History Chart](https://api.star-history.com/svg?repos=magpie-align/magpie&type=Date)](https://star-history.com/#magpie-align/magpie&Date) 169 | --------------------------------------------------------------------------------
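To make the pre-query-template idea described in the README above concrete, here is a minimal sketch (not part of the repository) of how an aligned chat model can be coaxed into emitting synthetic user instructions with vLLM. The model name, template prefix, stop tokens, and sampling values are illustrative assumptions for Llama-3-8B-Instruct; the actual pipeline in `exp/gen_ins.py` reads these from `configs/model_configs.json` and adds sanitization, checkpointing, and optional logits processors.

```
# Minimal sketch of the Magpie pre-query-template trick (illustrative only).
# Assumption: Llama-3 chat format; the repo reads real templates from configs/model_configs.json.
from vllm import LLM, SamplingParams

MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
# Chat-template prefix up to (and including) the empty user-turn header.
PRE_QUERY_TEMPLATE = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"

llm = LLM(model=MODEL)
params = SamplingParams(
    n=4,                        # several candidate instructions per call
    temperature=1.0,
    top_p=1.0,
    max_tokens=256,
    stop=["<|eot_id|>", "\n"],  # "\n" mirrors the repo's --early_stopping behavior
)

# The model completes the user turn itself, yielding synthetic instructions.
outputs = llm.generate(PRE_QUERY_TEMPLATE, params)
for completion in outputs[0].outputs:
    print(completion.text.strip())
```

Responses for these instructions would then be generated with a normal chat prompt, as `exp/gen_res.py` does.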