├── .gitattributes ├── .github └── workflows │ └── cpu_ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── .gitkeep ├── cli.png └── flow.png ├── examples ├── evaluate.ipynb └── scoring.ipynb ├── format.sh ├── pyproject.toml ├── recipes ├── sky-t1-7b │ └── README.md ├── sky-t1-flash │ └── README.md └── sky-t1-preview │ ├── README.md │ ├── __init__.py │ ├── postprocess.py │ ├── preprocess.py │ ├── prompts.py │ └── recipe.py ├── scripts ├── __init__.py ├── combine_data.py ├── convert_format.py ├── convert_to_data.py ├── label_math_difficulty.py ├── prompts.py ├── qwen_eval_bon.py ├── response_rewrite.py └── upload_hub.py ├── skythought ├── __init__.py ├── evals │ ├── README.md │ ├── __init__.py │ ├── base_instruct_evals.md │ ├── batch │ │ ├── __init__.py │ │ ├── engines │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── initializer.py │ │ │ └── vllm_engine.py │ │ ├── env_config.py │ │ ├── logging │ │ │ └── __init__.py │ │ ├── pipeline.py │ │ ├── tokenizer.py │ │ ├── utils.py │ │ └── workload.py │ ├── cli.py │ ├── common │ │ ├── __init__.py │ │ └── entities.py │ ├── inference_and_check.py │ ├── labeled_numina_difficulty │ │ └── README.md │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ ├── model_configs.yaml │ │ └── system_prompts │ │ │ └── prime.txt │ ├── ray_configs │ │ └── ray_config.yaml │ ├── scoring │ │ ├── __init__.py │ │ ├── apps │ │ │ ├── __init__.py │ │ │ ├── apps_scorer.py │ │ │ └── apps_util.py │ │ ├── base.py │ │ ├── gsm8k │ │ │ ├── __init__.py │ │ │ └── gsm8k_scorer.py │ │ ├── ifeval │ │ │ ├── __init__.py │ │ │ ├── ifeval_scorer.py │ │ │ ├── instructions.py │ │ │ ├── instructions_main.py │ │ │ ├── instructions_registry.py │ │ │ └── instructions_util.py │ │ ├── livecodebench │ │ │ ├── __init__.py │ │ │ ├── livecodebench_scorer.py │ │ │ └── livecodebench_util.py │ │ ├── math │ │ │ ├── __init__.py │ │ │ └── math_scorer.py │ │ ├── taco │ │ │ ├── __init__.py │ │ │ ├── taco_scorer.py │ │ │ └── taco_util.py │ │ └── utils 
│ │ │ ├── __init__.py │ │ │ └── pyext2.py │ ├── tasks │ │ ├── __init__.py │ │ ├── aime │ │ │ ├── aime24.yaml │ │ │ ├── aime24_sky.yaml │ │ │ ├── aime25_1.yaml │ │ │ ├── aime25_2.yaml │ │ │ └── aime_handler.py │ │ ├── amc23 │ │ │ ├── amc23.yaml │ │ │ └── amc23_handler.py │ │ ├── apps │ │ │ ├── apps.yaml │ │ │ ├── apps_handler.py │ │ │ └── apps_util.py │ │ ├── arc │ │ │ ├── arc_c.yaml │ │ │ └── arc_handler.py │ │ ├── base.py │ │ ├── gpqa_diamond │ │ │ ├── gpqa_diamond.yaml │ │ │ └── gpqa_diamond_handler.py │ │ ├── gsm8k │ │ │ ├── gsm8k.yaml │ │ │ └── gsm8k_handler.py │ │ ├── liveaops │ │ │ ├── liveaops.yaml │ │ │ └── liveaops_handler.py │ │ ├── livecodebench │ │ │ ├── livecodebench.yaml │ │ │ ├── livecodebench_easy.yaml │ │ │ ├── livecodebench_handler.py │ │ │ ├── livecodebench_hard.yaml │ │ │ ├── livecodebench_medium.yaml │ │ │ └── livecodebench_util.py │ │ ├── math │ │ │ ├── math500.yaml │ │ │ └── math_handler.py │ │ ├── minervamath │ │ │ ├── minervamath.yaml │ │ │ └── minervamath_handler.py │ │ ├── mmlu │ │ │ ├── mmlu.yaml │ │ │ ├── mmlu_handler.py │ │ │ └── mmlu_pro.yaml │ │ ├── numina │ │ │ ├── numina.yaml │ │ │ ├── numina_amc_aime.yaml │ │ │ ├── numina_handler.py │ │ │ ├── numina_math.yaml │ │ │ └── numina_olympiads.yaml │ │ ├── olympiadbench │ │ │ ├── olympiadbench_handler.py │ │ │ └── olympiadbench_math_en.yaml │ │ ├── omni_math │ │ │ ├── omni_handler.py │ │ │ └── omni_math.yaml │ │ ├── taco │ │ │ ├── pyext2.py │ │ │ ├── taco.yaml │ │ │ ├── taco_handler.py │ │ │ └── taco_util.py │ │ └── task_util.py │ └── util │ │ ├── __init__.py │ │ ├── cli_util.py │ │ ├── common.py │ │ ├── math_parsing_util.py │ │ ├── metrics.py │ │ ├── response.py │ │ └── results.py ├── skythought-rl │ ├── .readthedocs.yaml │ ├── .style.yapf │ ├── LICENSE │ ├── Notice.txt │ ├── README.md │ ├── data │ │ ├── data_prepare_mini.py │ │ ├── data_prepare_step2.py │ │ ├── data_prepare_step4.py │ │ └── data_prepare_zero.py │ ├── docker │ │ ├── Dockerfile.ngc.vllm │ │ └── Dockerfile.vemlp.vllm.te │ 
├── docs │ │ ├── Makefile │ │ ├── README.md │ │ ├── _static │ │ │ └── logo.png │ │ ├── advance │ │ │ ├── dpo_extension.rst │ │ │ ├── fsdp_extension.rst │ │ │ ├── megatron_extension.rst │ │ │ └── placement.rst │ │ ├── conf.py │ │ ├── examples │ │ │ ├── config.rst │ │ │ ├── gsm8k_example.rst │ │ │ └── ppo_code_architecture.rst │ │ ├── experiment │ │ │ └── ppo.rst │ │ ├── index.rst │ │ ├── preparation │ │ │ ├── prepare_data.rst │ │ │ └── reward_function.rst │ │ ├── requirements-docs.txt │ │ ├── start │ │ │ ├── install.rst │ │ │ └── quickstart.rst │ │ └── workers │ │ │ ├── fsdp_workers.rst │ │ │ ├── megatron_workers.rst │ │ │ └── ray_trainer.rst │ ├── examples │ │ ├── data_preprocess │ │ │ ├── full_hh_rlhf.py │ │ │ ├── gsm8k.py │ │ │ ├── hellaswag.py │ │ │ ├── math_dataset.py │ │ │ └── taco.py │ │ ├── generation │ │ │ └── run_deepseek_v2_lite_math.sh │ │ ├── ppo_trainer │ │ │ ├── run_deepseek7b_llm.sh │ │ │ ├── run_deepseek_full_hh_rlhf.sh │ │ │ ├── run_deepseek_math_gsm8k_megatron.sh │ │ │ ├── run_deepseek_megatron.sh │ │ │ ├── run_gemma.sh │ │ │ ├── run_prime-7b.sh │ │ │ ├── run_qwen2-7b.sh │ │ │ ├── run_qwen2-7b_rm.sh │ │ │ └── run_qwen2.5-32b.sh │ │ ├── ray │ │ │ └── tutorial.ipynb │ │ ├── sft │ │ │ └── gsm8k │ │ │ │ ├── run_deepseek_6b7.sh │ │ │ │ ├── run_gemma_2b.sh │ │ │ │ └── run_gemma_7b.sh │ │ ├── sky-t1 │ │ │ ├── run-sky-t1-7b-step2.sh │ │ │ ├── run-sky-t1-7b-step4.sh │ │ │ ├── run-sky-t1-7b-zero.sh │ │ │ └── run-sky-t1-mini.sh │ │ └── split_placement │ │ │ ├── README.md │ │ │ ├── config │ │ │ └── ppo_trainer_split.yaml │ │ │ ├── main_ppo_split.py │ │ │ ├── run_deepseek7b_llm.sh │ │ │ └── split_monkey_patch.py │ ├── patches │ │ └── megatron_v4.patch │ ├── pyproject.toml │ ├── requirements.txt │ ├── setup.py │ ├── tests │ │ ├── ray │ │ │ ├── check_worker_alive │ │ │ │ └── main.py │ │ │ ├── detached_worker │ │ │ │ ├── README.md │ │ │ │ ├── client.py │ │ │ │ ├── run.sh │ │ │ │ └── server.py │ │ │ ├── test_check_worker_alive.py │ │ │ ├── 
test_colocated_workers.py │ │ │ ├── test_data_transfer.py │ │ │ ├── test_driverfunc_to_worker.py │ │ │ ├── test_high_level_scheduling_api.py │ │ │ ├── test_ray_local_envs.py │ │ │ ├── test_remote_api.py │ │ │ ├── test_rvdz.py │ │ │ ├── test_worker_group_basics.py │ │ │ └── test_worker_group_torch.py │ │ └── verl │ │ │ └── utils │ │ │ └── dataset │ │ │ ├── test_rl_dataset.py │ │ │ ├── test_rm_dataset.py │ │ │ └── test_sft_dataset.py │ └── verl │ │ ├── __init__.py │ │ ├── models │ │ ├── README.md │ │ ├── __init__.py │ │ ├── llama │ │ │ ├── __init__.py │ │ │ └── megatron │ │ │ │ ├── __init__.py │ │ │ │ ├── checkpoint_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── llama_loader.py │ │ │ │ └── llama_saver.py │ │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── parallel_attention.py │ │ │ │ ├── parallel_decoder.py │ │ │ │ ├── parallel_linear.py │ │ │ │ ├── parallel_mlp.py │ │ │ │ └── parallel_rmsnorm.py │ │ │ │ └── modeling_llama_megatron.py │ │ ├── registry.py │ │ ├── transformers │ │ │ ├── __init__.py │ │ │ ├── llama.py │ │ │ ├── monkey_patch.py │ │ │ └── qwen2.py │ │ └── weight_loader_registry.py │ │ ├── protocol.py │ │ ├── single_controller │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── decorator.py │ │ │ ├── megatron │ │ │ │ ├── __init__.py │ │ │ │ ├── worker.py │ │ │ │ └── worker_group.py │ │ │ ├── register_center │ │ │ │ ├── __init__.py │ │ │ │ └── ray.py │ │ │ ├── worker.py │ │ │ └── worker_group.py │ │ └── ray │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── megatron.py │ │ ├── third_party │ │ ├── __init__.py │ │ └── vllm │ │ │ ├── __init__.py │ │ │ ├── vllm_v_0_3_1 │ │ │ ├── __init__.py │ │ │ ├── arg_utils.py │ │ │ ├── config.py │ │ │ ├── llm.py │ │ │ ├── llm_engine_sp.py │ │ │ ├── model_loader.py │ │ │ ├── model_runner.py │ │ │ ├── parallel_state.py │ │ │ ├── tokenizer.py │ │ │ ├── weight_loaders.py │ │ │ └── worker.py │ │ │ ├── vllm_v_0_4_2 │ │ │ ├── __init__.py │ │ │ ├── arg_utils.py │ │ │ ├── config.py │ │ │ ├── dtensor_weight_loaders.py │ │ 
│ ├── hf_weight_loader.py │ │ │ ├── llm.py │ │ │ ├── llm_engine_sp.py │ │ │ ├── megatron_weight_loaders.py │ │ │ ├── model_loader.py │ │ │ ├── model_runner.py │ │ │ ├── parallel_state.py │ │ │ ├── spmd_gpu_executor.py │ │ │ ├── tokenizer.py │ │ │ └── worker.py │ │ │ ├── vllm_v_0_5_4 │ │ │ ├── __init__.py │ │ │ ├── arg_utils.py │ │ │ ├── config.py │ │ │ ├── dtensor_weight_loaders.py │ │ │ ├── hf_weight_loader.py │ │ │ ├── llm.py │ │ │ ├── llm_engine_sp.py │ │ │ ├── megatron_weight_loaders.py │ │ │ ├── model_loader.py │ │ │ ├── model_runner.py │ │ │ ├── parallel_state.py │ │ │ ├── spmd_gpu_executor.py │ │ │ ├── tokenizer.py │ │ │ └── worker.py │ │ │ └── vllm_v_0_6_3 │ │ │ ├── __init__.py │ │ │ ├── arg_utils.py │ │ │ ├── config.py │ │ │ ├── dtensor_weight_loaders.py │ │ │ ├── hf_weight_loader.py │ │ │ ├── llm.py │ │ │ ├── llm_engine_sp.py │ │ │ ├── megatron_weight_loaders.py │ │ │ ├── model_loader.py │ │ │ ├── model_runner.py │ │ │ ├── parallel_state.py │ │ │ ├── spmd_gpu_executor.py │ │ │ ├── tokenizer.py │ │ │ └── worker.py │ │ ├── trainer │ │ ├── __init__.py │ │ ├── config │ │ │ ├── evaluation.yaml │ │ │ ├── generation.yaml │ │ │ ├── ppo_megatron_trainer.yaml │ │ │ ├── ppo_trainer.yaml │ │ │ └── sft_trainer.yaml │ │ ├── fsdp_sft_trainer.py │ │ ├── main_eval.py │ │ ├── main_generation.py │ │ ├── main_ppo.py │ │ ├── main_ppo_sky.py │ │ ├── ppo │ │ │ ├── __init__.py │ │ │ ├── core_algos.py │ │ │ └── ray_trainer.py │ │ └── runtime_env.yaml │ │ ├── utils │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dataset │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── rl_dataset.py │ │ │ ├── rm_dataset.py │ │ │ └── sft_dataset.py │ │ ├── debug │ │ │ ├── __init__.py │ │ │ ├── performance.py │ │ │ └── trajectory_tracker.py │ │ ├── distributed.py │ │ ├── flops_counter.py │ │ ├── fs.py │ │ ├── fsdp_utils.py │ │ ├── hdfs_io.py │ │ ├── import_utils.py │ │ ├── logger │ │ │ ├── __init__.py │ │ │ └── aggregate_logger.py │ │ ├── logging_utils.py │ │ ├── megatron │ │ │ ├── __init__.py │ │ │ 
├── memory.py │ │ │ ├── optimizer.py │ │ │ ├── optimizer_config.py │ │ │ ├── pipeline_parallel.py │ │ │ ├── sequence_parallel.py │ │ │ └── tensor_parallel.py │ │ ├── megatron_utils.py │ │ ├── memory_buffer.py │ │ ├── model.py │ │ ├── py_functional.py │ │ ├── ray_utils.py │ │ ├── rendezvous │ │ │ ├── __init__.py │ │ │ └── ray_backend.py │ │ ├── reward_score │ │ │ ├── __init__.py │ │ │ ├── evaluation_utils │ │ │ │ ├── code_util │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── testing_util.py │ │ │ │ │ └── utils.py │ │ │ │ └── math_util │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── grader.py │ │ │ │ │ ├── math_normalize.py │ │ │ │ │ └── testing_utlis.py │ │ │ ├── gsm8k.py │ │ │ ├── gt_verifier.py │ │ │ └── math.py │ │ ├── seqlen_balancing.py │ │ ├── tokenizer.py │ │ ├── torch_dtypes.py │ │ ├── torch_functional.py │ │ ├── tracking.py │ │ └── ulysses.py │ │ ├── version │ │ └── version │ │ └── workers │ │ ├── __init__.py │ │ ├── actor │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dp_actor.py │ │ └── megatron_actor.py │ │ ├── critic │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dp_critic.py │ │ └── megatron_critic.py │ │ ├── fsdp_workers.py │ │ ├── megatron_workers.py │ │ ├── reward_model │ │ ├── __init__.py │ │ ├── base.py │ │ └── megatron │ │ │ ├── __init__.py │ │ │ └── reward_model.py │ │ ├── rollout │ │ ├── __init__.py │ │ ├── base.py │ │ ├── hf_rollout.py │ │ ├── naive │ │ │ ├── __init__.py │ │ │ └── naive_rollout.py │ │ ├── tokenizer.py │ │ └── vllm_rollout │ │ │ ├── __init__.py │ │ │ └── vllm_rollout.py │ │ └── sharding_manager │ │ ├── __init__.py │ │ ├── base.py │ │ ├── fsdp_ulysses.py │ │ ├── fsdp_vllm.py │ │ └── megatron_vllm.py ├── test-time-scaling │ ├── README.md │ ├── assets │ │ └── figure1.png │ ├── codecontest_evaluate_multiprocess.py │ ├── evaluate_multiprocess.py │ ├── live_code_bench_execute.py │ ├── live_code_bench_program.py │ ├── pattern_icl_map.json │ ├── scripts │ │ ├── baselines │ │ │ ├── 4o_mini.sh │ │ │ ├── 4o_mini_cct.sh │ │ │ ├── o1_mini.sh │ │ │ ├── 
o1_preview.sh │ │ │ ├── o3_mini.sh │ │ │ ├── qwen0.5b.sh │ │ │ ├── qwen1.5b.sh │ │ │ ├── qwen14b.sh │ │ │ ├── qwen32b.sh │ │ │ ├── qwen3b.sh │ │ │ ├── qwen7b.sh │ │ │ ├── qwq32b.sh │ │ │ ├── r1qwen14b.sh │ │ │ ├── r1qwen32b.sh │ │ │ └── r1qwen7b.sh │ │ ├── baselines_selfdebug │ │ │ ├── 4o_mini_n_1_debug_public3_random.sh │ │ │ ├── o1_mini_n_1_debug_public3_random.sh │ │ │ ├── qwen0.5b_n_1_debug_public3_random.sh │ │ │ ├── qwen1.5b_n_1_debug_public3_random.sh │ │ │ ├── qwen14b_n_1_debug_public3_random.sh │ │ │ ├── qwen32b_n_1_debug_public3_random.sh │ │ │ ├── qwen3b_n_1_debug_public3_random.sh │ │ │ ├── qwen7b_n_1_debug_public3_random.sh │ │ │ ├── qwq32b_n_1_debug_public3_random.sh │ │ │ ├── r1qwen14b_n_1_debug_public3_random.sh │ │ │ ├── r1qwen32b_n_1_debug_public3_random.sh │ │ │ └── r1qwen7b_n_1_debug_public3_random.sh │ │ ├── final_first_cached │ │ │ ├── 4omini_n_16_debug_public3_select_first_cached.sh │ │ │ ├── batch_small_models_first.sh │ │ │ ├── gh200_1_batch.sh │ │ │ ├── gh200_2_batch.sh │ │ │ ├── o1mini_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen0.5b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen1.5b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen14b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen3b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwen7b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── qwq32b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── r1qwen14b_n_16_debug_public3_select_first_cached.sh │ │ │ ├── r1qwen32b_n_16_debug_public3_select_first_cached.sh │ │ │ └── r1qwen7b_n_16_debug_public3_select_first_cached.sh │ │ ├── final_gentest_notimeout_cached │ │ │ ├── 4omini_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── batch_small_models_gentest.sh │ │ │ ├── o1mini_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwen0.5b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwen1.5b_n_16_debug_public3_select_4omini_cached.sh │ 
│ │ ├── qwen14b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwen3b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwen7b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── qwq32b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── r1qwen14b_n_16_debug_public3_select_4omini_cached.sh │ │ │ ├── r1qwen32b_n_16_debug_public3_select_4omini_cached.sh │ │ │ └── r1qwen7b_n_16_debug_public3_select_4omini_cached.sh │ │ ├── final_oracle │ │ │ ├── 4omini_n_16_debug_public3_select_oracle.sh │ │ │ ├── o1mini_n_16_debug_public3_select_oracle.sh │ │ │ ├── qwen0.5b_n_16_debug_public3_select_oracle.sh │ │ │ ├── qwen1.5b_n_32_debug_public3_select_oracle.sh │ │ │ ├── qwen14b_n_16_debug_public3_select_oracle.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_oracle.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_oracle_icl_patterns.sh │ │ │ ├── qwen3b_n_32_debug_public3_select_oracle.sh │ │ │ ├── qwen7b_n_16_debug_public3_select_oracle.sh │ │ │ ├── qwq32b_n_16_debug_public3_select_oracle.sh │ │ │ ├── r1qwen14b_n_16_debug_public3_select_oracle.sh │ │ │ ├── r1qwen32b_n_16_debug_public3_select_oracle.sh │ │ │ └── r1qwen7b_n_16_debug_public3_select_oracle.sh │ │ ├── final_random_cached │ │ │ ├── 4omini_n_16_debug_public3_select_random_cached.sh │ │ │ ├── batch_small_models_random.sh │ │ │ ├── o1mini_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen0.5b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen1.5b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen14b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen3b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwen7b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── qwq32b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── r1qwen14b_n_16_debug_public3_select_random_cached.sh │ │ │ ├── r1qwen32b_n_16_debug_public3_select_random_cached.sh │ │ │ └── 
r1qwen7b_n_16_debug_public3_select_random_cached.sh │ │ ├── final_tool_assisted_cached │ │ │ ├── 4omini_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── batch_small_models_tool_assisted.sh │ │ │ ├── o1mini_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen0.5b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen1.5b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen14b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen32b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen3b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwen7b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── qwq32b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── r1qwen14b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ ├── r1qwen32b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ │ └── r1qwen7b_n_16_debug_public3_select_tool_assisted_cached.sh │ │ ├── majority_baselines │ │ │ ├── 4o_mini_n_16_majority.sh │ │ │ ├── o1_mini_n_16_majority.sh │ │ │ ├── qwen0.5b_n_16_majority.sh │ │ │ ├── qwen1.5b_n_16_majority.sh │ │ │ ├── qwen14b_n_16_majority.sh │ │ │ ├── qwen32b_n_16_majority.sh │ │ │ ├── qwen3b_n_16_majority.sh │ │ │ ├── qwen7b_n_16_majority.sh │ │ │ ├── qwq32b_n_16_majority.sh │ │ │ ├── r1qwen14b_n_16_majority.sh │ │ │ ├── r1qwen32b_n_16_majority.sh │ │ │ └── r1qwen7b_n_16_majority.sh │ │ ├── sec4_parallel_sample │ │ │ ├── temp02_4o_mini.sh │ │ │ ├── temp02_qwen7b.sh │ │ │ ├── temp02_qwen_32b.sh │ │ │ ├── temp05_4o_mini.sh │ │ │ ├── temp05_qwen7b.sh │ │ │ ├── temp05_qwen_32b.sh │ │ │ ├── temp09_4o_mini.sh │ │ │ ├── temp09_qwen7b.sh │ │ │ ├── temp09_qwen_32b.sh │ │ │ ├── vanilla_4o_mini.sh │ │ │ ├── vanilla_qwen_32b.sh │ │ │ ├── vanilla_qwen_7b.sh │ │ │ ├── vanilla_qwq_32b.sh │ │ │ ├── vanilla_qwq_32b_hard.sh │ │ │ └── vanilla_qwq_32b_medium.sh │ │ ├── sec5_revision │ │ │ ├── last_4o_mini.sh │ │ │ ├── last_qwen_32b.sh │ │ │ ├── last_qwen_7b.sh │ │ │ ├── 
last_qwq_32b_with_4o_debug.sh │ │ │ ├── refine_4o_mini.sh │ │ │ ├── refine_qwen_32b.sh │ │ │ ├── refine_qwen_7b.sh │ │ │ ├── vanilla_4o_mini.sh │ │ │ ├── vanilla_qwen_32b.sh │ │ │ ├── vanilla_qwen_7b.sh │ │ │ ├── vanilla_wo_reasoning_qwq_32b_with_4o_debug.sh │ │ │ └── vanilla_wo_reasoning_qwq_self_rewrite.sh │ │ ├── sec6 │ │ │ ├── 4o_mini_tool_assisted.sh │ │ │ ├── 4o_mini_vanilla_baseline.sh │ │ │ ├── 4o_mini_vanilla_with_4omini_generated_and_timeout_test.sh │ │ │ ├── qwen_32b_tool_assisted.sh │ │ │ ├── qwen_32b_vanilla_baseline.sh │ │ │ ├── qwen_32b_with4omini_test_and_timeout_vanilla.sh │ │ │ ├── qwen_7b_tool_assisted.sh │ │ │ ├── qwen_7b_vanilla_baseline.sh │ │ │ ├── qwen_7b_with4omini_test_and_timeout_vanilla.sh │ │ │ ├── qwq_32b_with_4omini_test_and_timeout_wo_reasoning_vanilla_4o_debug.sh │ │ │ ├── qwq_32b_wo_reasoning_vanilla_4o_debug_baseline.sh │ │ │ └── qwq_32b_wo_reasoning_vanilla_4o_debug_tool_assisted.sh │ │ ├── sec6_llm_judge_baseline │ │ │ ├── 4o_mini_llm_judge_baseline.sh │ │ │ ├── qwen_32b_llm_judge_baseline.sh │ │ │ ├── qwen_7b_llm_judge_baseline.sh │ │ │ └── qwq_32b_wo_reasoning_vanilla_4o_debug_llm_judge_baseline.sh │ │ └── sec6_o1_generated │ │ │ ├── 4o_mini_vanilla_with_o1_generated_and_timeout_test.sh │ │ │ ├── qwen_32b_with_o1__test_and_timeout_vanilla.sh │ │ │ ├── qwen_7b_with_o1_test_and_timeout_vanilla.sh │ │ │ └── qwq_32b_with_o1_test_and_timeout_wo_reasoning_vanilla_4o_debug.sh │ └── util.py └── train │ ├── LLaMA-Factory │ ├── .deepspeed_env │ ├── .dockerignore │ ├── .env.local │ ├── .gitattributes │ ├── .github │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── ISSUE_TEMPLATE │ │ │ └── bug-report.yml │ │ ├── PULL_REQUEST_TEMPLATE.md │ │ ├── SECURITY.md │ │ └── workflows │ │ │ ├── label_issue.yml │ │ │ ├── publish.yml │ │ │ └── tests.yml │ ├── .gitignore │ ├── .pre-commit-config.yaml │ ├── CITATION.cff │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── README_zh.md │ ├── assets │ │ ├── benchmark.svg │ │ ├── 
logo.png │ │ ├── wechat.jpg │ │ └── wechat_npu.jpg │ ├── data │ │ ├── README.md │ │ ├── README_zh.md │ │ ├── alpaca_en_demo.json │ │ ├── alpaca_zh_demo.json │ │ ├── belle_multiturn │ │ │ └── belle_multiturn.py │ │ ├── c4_demo.json │ │ ├── dataset_info.json │ │ ├── dpo_en_demo.json │ │ ├── dpo_zh_demo.json │ │ ├── glaive_toolcall_en_demo.json │ │ ├── glaive_toolcall_zh_demo.json │ │ ├── hh_rlhf_en │ │ │ └── hh_rlhf_en.py │ │ ├── identity.json │ │ ├── kto_en_demo.json │ │ ├── mllm_demo.json │ │ ├── mllm_demo_data │ │ │ ├── 1.jpg │ │ │ ├── 1.mp4 │ │ │ ├── 2.avi │ │ │ ├── 2.jpg │ │ │ ├── 3.jpg │ │ │ └── 3.mp4 │ │ ├── mllm_video_demo.json │ │ ├── ultra_chat │ │ │ └── ultra_chat.py │ │ └── wiki_demo.txt │ ├── docker │ │ ├── docker-cuda │ │ │ ├── Dockerfile │ │ │ └── docker-compose.yml │ │ ├── docker-npu │ │ │ ├── Dockerfile │ │ │ └── docker-compose.yml │ │ └── docker-rocm │ │ │ ├── Dockerfile │ │ │ └── docker-compose.yml │ ├── evaluation │ │ ├── ceval │ │ │ ├── ceval.py │ │ │ ├── ceval.zip │ │ │ └── mapping.json │ │ ├── cmmlu │ │ │ ├── cmmlu.py │ │ │ ├── cmmlu.zip │ │ │ └── mapping.json │ │ └── mmlu │ │ │ ├── mapping.json │ │ │ ├── mmlu.py │ │ │ └── mmlu.zip │ ├── examples │ │ ├── README.md │ │ ├── README_zh.md │ │ ├── accelerate │ │ │ └── fsdp_config.yaml │ │ ├── deepspeed │ │ │ ├── ds_z0_config.json │ │ │ ├── ds_z2_config.json │ │ │ ├── ds_z2_offload_config.json │ │ │ ├── ds_z3_config.json │ │ │ └── ds_z3_offload_config.json │ │ ├── extras │ │ │ ├── adam_mini │ │ │ │ └── qwen2_full_sft.yaml │ │ │ ├── badam │ │ │ │ └── llama3_full_sft.yaml │ │ │ ├── fsdp_qlora │ │ │ │ ├── llama3_lora_sft.yaml │ │ │ │ └── train.sh │ │ │ ├── galore │ │ │ │ └── llama3_full_sft.yaml │ │ │ ├── llama_pro │ │ │ │ ├── expand.sh │ │ │ │ └── llama3_freeze_sft.yaml │ │ │ ├── loraplus │ │ │ │ └── llama3_lora_sft.yaml │ │ │ ├── mod │ │ │ │ └── llama3_full_sft.yaml │ │ │ ├── nlg_eval │ │ │ │ └── llama3_lora_predict.yaml │ │ │ └── pissa │ │ │ │ ├── init.sh │ │ │ │ └── llama3_lora_sft.yaml │ │ ├── 
inference │ │ │ ├── llama3.yaml │ │ │ ├── llama3_lora_sft.yaml │ │ │ ├── llama3_vllm.yaml │ │ │ ├── llava1_5.yaml │ │ │ └── qwen2_vl.yaml │ │ ├── merge_lora │ │ │ ├── llama3_gptq.yaml │ │ │ ├── llama3_lora_sft.yaml │ │ │ └── qwen2vl_lora_sft.yaml │ │ ├── train_full │ │ │ ├── llama3_full_sft.yaml │ │ │ ├── qwen2_full_sft.yaml │ │ │ ├── qwen2_full_simpo.yaml │ │ │ └── qwen2vl_full_sft.yaml │ │ ├── train_lora │ │ │ ├── llama3_lora_dpo.yaml │ │ │ ├── llama3_lora_eval.yaml │ │ │ ├── llama3_lora_kto.yaml │ │ │ ├── llama3_lora_ppo.yaml │ │ │ ├── llama3_lora_pretrain.yaml │ │ │ ├── llama3_lora_reward.yaml │ │ │ ├── llama3_lora_sft.yaml │ │ │ ├── llama3_lora_sft_ds3.yaml │ │ │ ├── llama3_preprocess.yaml │ │ │ ├── llava1_5_lora_sft.yaml │ │ │ ├── qwen2_lora_config.yaml │ │ │ ├── qwen2vl_lora_dpo.yaml │ │ │ └── qwen2vl_lora_sft.yaml │ │ └── train_qlora │ │ │ ├── llama3_lora_sft_aqlm.yaml │ │ │ ├── llama3_lora_sft_awq.yaml │ │ │ ├── llama3_lora_sft_gptq.yaml │ │ │ └── llama3_lora_sft_otfq.yaml │ ├── launch_3_nodes.sh │ ├── pyproject.toml │ ├── requirements.txt │ ├── scripts │ │ ├── api_example │ │ │ ├── test_image.py │ │ │ └── test_toolcall.py │ │ ├── convert_ckpt │ │ │ ├── llamafy_baichuan2.py │ │ │ └── llamafy_qwen.py │ │ ├── llama_pro.py │ │ ├── loftq_init.py │ │ ├── pissa_init.py │ │ ├── stat_utils │ │ │ ├── cal_flops.py │ │ │ ├── cal_lr.py │ │ │ ├── cal_mfu.py │ │ │ ├── cal_ppl.py │ │ │ └── length_cdf.py │ │ └── vllm_infer.py │ ├── setup.py │ ├── single_node.sh │ ├── src │ │ ├── api.py │ │ ├── llamafactory │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ ├── chat.py │ │ │ │ ├── common.py │ │ │ │ └── protocol.py │ │ │ ├── chat │ │ │ │ ├── __init__.py │ │ │ │ ├── base_engine.py │ │ │ │ ├── chat_model.py │ │ │ │ ├── hf_engine.py │ │ │ │ └── vllm_engine.py │ │ │ ├── cli.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── aligner.py │ │ │ │ ├── collator.py │ │ │ │ ├── data_utils.py │ │ │ │ ├── formatter.py │ │ │ │ ├── loader.py │ │ │ │ 
├── mm_plugin.py │ │ │ │ ├── parser.py │ │ │ │ ├── preprocess.py │ │ │ │ ├── processors │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── feedback.py │ │ │ │ │ ├── pairwise.py │ │ │ │ │ ├── pretrain.py │ │ │ │ │ ├── processor_utils.py │ │ │ │ │ ├── supervised.py │ │ │ │ │ └── unsupervised.py │ │ │ │ ├── template.py │ │ │ │ └── tool_utils.py │ │ │ ├── eval │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluator.py │ │ │ │ └── template.py │ │ │ ├── extras │ │ │ │ ├── __init__.py │ │ │ │ ├── constants.py │ │ │ │ ├── env.py │ │ │ │ ├── logging.py │ │ │ │ ├── misc.py │ │ │ │ ├── packages.py │ │ │ │ └── ploting.py │ │ │ ├── hparams │ │ │ │ ├── __init__.py │ │ │ │ ├── data_args.py │ │ │ │ ├── evaluation_args.py │ │ │ │ ├── finetuning_args.py │ │ │ │ ├── generating_args.py │ │ │ │ ├── model_args.py │ │ │ │ └── parser.py │ │ │ ├── launcher.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── adapter.py │ │ │ │ ├── loader.py │ │ │ │ ├── model_utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── checkpointing.py │ │ │ │ │ ├── embedding.py │ │ │ │ │ ├── liger_kernel.py │ │ │ │ │ ├── longlora.py │ │ │ │ │ ├── misc.py │ │ │ │ │ ├── mod.py │ │ │ │ │ ├── moe.py │ │ │ │ │ ├── packing.py │ │ │ │ │ ├── quantization.py │ │ │ │ │ ├── rope.py │ │ │ │ │ ├── unsloth.py │ │ │ │ │ ├── valuehead.py │ │ │ │ │ └── visual.py │ │ │ │ └── patcher.py │ │ │ ├── train │ │ │ │ ├── __init__.py │ │ │ │ ├── callbacks.py │ │ │ │ ├── dpo │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── kto │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── ppo │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ppo_utils.py │ │ │ │ │ ├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── pt │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── rm │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── metric.py │ │ │ │ │ ├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── sft │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── metric.py │ │ │ │ │ 
├── trainer.py │ │ │ │ │ └── workflow.py │ │ │ │ ├── test_utils.py │ │ │ │ ├── trainer_utils.py │ │ │ │ └── tuner.py │ │ │ └── webui │ │ │ │ ├── __init__.py │ │ │ │ ├── chatter.py │ │ │ │ ├── common.py │ │ │ │ ├── components │ │ │ │ ├── __init__.py │ │ │ │ ├── chatbot.py │ │ │ │ ├── data.py │ │ │ │ ├── eval.py │ │ │ │ ├── export.py │ │ │ │ ├── infer.py │ │ │ │ ├── top.py │ │ │ │ └── train.py │ │ │ │ ├── css.py │ │ │ │ ├── engine.py │ │ │ │ ├── interface.py │ │ │ │ ├── locales.py │ │ │ │ ├── manager.py │ │ │ │ ├── runner.py │ │ │ │ └── utils.py │ │ ├── train.py │ │ └── webui.py │ ├── tests │ │ ├── data │ │ │ ├── processors │ │ │ │ ├── test_feedback.py │ │ │ │ ├── test_pairwise.py │ │ │ │ ├── test_processor_utils.py │ │ │ │ ├── test_supervised.py │ │ │ │ └── test_unsupervised.py │ │ │ ├── test_collator.py │ │ │ ├── test_formatter.py │ │ │ ├── test_mm_plugin.py │ │ │ └── test_template.py │ │ ├── e2e │ │ │ ├── test_chat.py │ │ │ └── test_train.py │ │ ├── eval │ │ │ └── test_eval_template.py │ │ └── model │ │ │ ├── model_utils │ │ │ ├── test_attention.py │ │ │ ├── test_checkpointing.py │ │ │ └── test_packing.py │ │ │ ├── test_base.py │ │ │ ├── test_freeze.py │ │ │ ├── test_full.py │ │ │ ├── test_lora.py │ │ │ └── test_pissa.py │ ├── train.sh │ └── zero3_config.json │ └── README.md ├── tests ├── __init__.py └── evals │ ├── __init__.py │ ├── scoring │ ├── __init__.py │ ├── apps │ │ └── test_apps.py │ ├── taco │ │ └── test_taco.py │ └── test_base.py │ ├── tasks │ ├── test_aime.py │ ├── test_amc.py │ ├── test_math.py │ ├── test_mmlu.py │ ├── test_mmlu_pro.py │ └── test_preprocessing.py │ ├── test_cli.py │ └── util │ ├── test_cli_util.py │ ├── test_common.py │ └── test_math_parsing.py └── uv.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | data/Sky-T1_data_17k.json filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: 
-------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.3 4 | hooks: 5 | - id: ruff 6 | args: [ --fix, --exit-non-zero-on-fix ] 7 | # NOTE (sumanthrh): Many of the files excluded here are used for validating code generation, and linters do not recognize some of the logic in these files. skythought/train is excluded for now because it's a fork of LLaMA-Factory. 8 | exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|pyext2\.py|taco_util\.py|apps_util\.py|scripts/prompts\.py|skythought/test-time-scaling/.*)$ 9 | 10 | 11 | # Black needs to be run after ruff with --fix 12 | - repo: https://github.com/psf/black 13 | rev: 24.10.0 14 | hooks: 15 | - id: black 16 | exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|pyext2\.py|skythought/test-time-scaling/.*)$ 17 | -------------------------------------------------------------------------------- /assets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/assets/.gitkeep -------------------------------------------------------------------------------- /assets/cli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/assets/cli.png -------------------------------------------------------------------------------- /assets/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/assets/flow.png -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | 4 | if command -v uv >/dev/null 2>&1; 
then 5 | uv pip install -q pre-commit 6 | else 7 | pip install -q pre-commit 8 | fi 9 | 10 | # pre-commit run --all-files always runs from the root directory. we run this only on tools/ for now. 11 | pre-commit run --all-files --config .pre-commit-config.yaml 12 | -------------------------------------------------------------------------------- /recipes/sky-t1-preview/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/recipes/sky-t1-preview/__init__.py -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/scripts/__init__.py -------------------------------------------------------------------------------- /skythought/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/__init__.py -------------------------------------------------------------------------------- /skythought/evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/evals/__init__.py -------------------------------------------------------------------------------- /skythought/evals/batch/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [] 2 | 3 | from .engines import init_engine_from_config 4 | from .pipeline import Pipeline 5 | from .workload import ( 6 | EvalWorkload, 7 | ) 8 | 9 | __all__ = [ 10 | "Pipeline", 11 | "init_engine_from_config", 12 | "EvalWorkload", 13 | ] 14 | 
-------------------------------------------------------------------------------- /skythought/evals/batch/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """LLM Engines.""" 2 | 3 | __all__ = [] 4 | 5 | from .initializer import EngineInitializerBase, init_engine_from_config 6 | 7 | __all__ = [ 8 | "EngineInitializerBase", 9 | "init_engine_from_config", 10 | ] 11 | -------------------------------------------------------------------------------- /skythought/evals/batch/engines/base.py: -------------------------------------------------------------------------------- 1 | """Engine base.""" 2 | 3 | from typing import Any, AsyncGenerator, Dict 4 | 5 | import numpy as np 6 | 7 | 8 | class EngineBase: 9 | """Base class for engines.""" 10 | 11 | async def __call__( 12 | self, batch: Dict[str, np.ndarray] 13 | ) -> AsyncGenerator[Dict[str, Any], None]: 14 | """Call the LLM engine asynchronously to process a Ray Data batch. 15 | 16 | Args: 17 | batch: The batch. 18 | 19 | Yields: 20 | The output. 21 | """ 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /skythought/evals/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/evals/common/__init__.py -------------------------------------------------------------------------------- /skythought/evals/labeled_numina_difficulty/README.md: -------------------------------------------------------------------------------- 1 | # Labeled NUMINA Difficulty Data 2 | 3 | We also include difficulty-labeled data from NUMINA in the following files: `labeled_amc_aime_0_-1.json`, `labeled_math_0_-1.json`, `labeled_olympiads_0_-1.json`. These files can be found and downloaded from [HuggingFace](https://huggingface.co/datasets/NovaSky-AI/labeled_numina_difficulty). 
-------------------------------------------------------------------------------- /skythought/evals/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ModelConfig, get_system_prompt_keys 2 | 3 | __all__ = ["ModelConfig", "get_system_prompt_keys"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Scorer 2 | from .gsm8k import GSM8KScorer 3 | from .math import MathEqualScorer, MathVerifyScorer 4 | 5 | __all__ = ["Scorer", "MathEqualScorer", "MathVerifyScorer", "GSM8KScorer"] 6 | -------------------------------------------------------------------------------- /skythought/evals/scoring/apps/__init__.py: -------------------------------------------------------------------------------- 1 | from .apps_scorer import APPSScorer 2 | 3 | __all__ = ["APPSScorer"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/gsm8k/__init__.py: -------------------------------------------------------------------------------- 1 | from .gsm8k_scorer import GSM8KScorer 2 | 3 | __all__ = ["GSM8KScorer"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/ifeval/__init__.py: -------------------------------------------------------------------------------- 1 | from .ifeval_scorer import IfEvalScorer 2 | 3 | __all__ = ["IfEvalScorer"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/livecodebench/__init__.py: -------------------------------------------------------------------------------- 1 | from .livecodebench_scorer import LiveCodeBenchScorer 2 | 3 | __all__ = ["LiveCodeBenchScorer"] 4 | -------------------------------------------------------------------------------- 
/skythought/evals/scoring/math/__init__.py: -------------------------------------------------------------------------------- 1 | from .math_scorer import MathEqualScorer, MathVerifyScorer 2 | 3 | __all__ = ["MathVerifyScorer", "MathEqualScorer"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/taco/__init__.py: -------------------------------------------------------------------------------- 1 | from .taco_scorer import TACOScorer 2 | 3 | __all__ = ["TACOScorer"] 4 | -------------------------------------------------------------------------------- /skythought/evals/scoring/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/evals/scoring/utils/__init__.py -------------------------------------------------------------------------------- /skythought/evals/tasks/aime/aime24.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: AI-MO/aimo-validation-aime 3 | dataset_split: train 4 | question_key: problem 5 | answer_key: answer 6 | templating_parameters: 7 | template: "Return your final response within \\boxed{{}}. 
{prompt}" 8 | preprocess_config: 9 | url: "2024" 10 | -------------------------------------------------------------------------------- /skythought/evals/tasks/aime/aime24_sky.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: AI-MO/aimo-validation-aime 3 | dataset_split: train 4 | question_key: problem 5 | answer_key: answer 6 | templating_parameters: 7 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 8 | preprocess_config: 9 | url: "2024" -------------------------------------------------------------------------------- /skythought/evals/tasks/aime/aime25_1.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: opencompass/AIME2025 3 | dataset_subset: AIME2025-I 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 9 | 10 | -------------------------------------------------------------------------------- /skythought/evals/tasks/aime/aime25_2.yaml: -------------------------------------------------------------------------------- 1 | handler: aime 2 | dataset_path: opencompass/AIME2025 3 | dataset_subset: AIME2025-II 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "{prompt}\nReturn your final response within \\boxed{{}}" 9 | 10 | -------------------------------------------------------------------------------- /skythought/evals/tasks/amc23/amc23.yaml: -------------------------------------------------------------------------------- 1 | handler: amc23 2 | dataset_path: AI-MO/aimo-validation-amc 3 | dataset_kwargs: 4 | trust_remote_code: true 5 | dataset_split: train 6 | question_key: problem 7 | answer_key: answer 8 | # Optionally, you can filter the dataset by difficulty 9 | # preprocess_config: 10 | # difficulty: easy 11 | 
class AMC23TaskHandler(MathTaskHandler):
    """Math task handler that keeps only rows whose ``url`` mentions 2023."""

    def load_and_filter_dataset(
        self, start, end, split=None, subset=None, difficulty=None
    ):
        """Load the dataset, keep the 2023 rows, and return a slice of them.

        Args:
            start: Positional index of the first filtered row to return.
            end: Positional index one past the last row to return. Any value
                that is not greater than 0 selects through the last row.
            split: Forwarded to ``self.load_dataset``.
            subset: Forwarded to ``self.load_dataset``.
            difficulty: Accepted but not used by this method.

        Returns:
            The selected rows of the filtered pandas DataFrame.
        """
        frame = self.load_dataset(subset=subset, split=split).to_pandas()
        # na=False: rows with a missing url count as non-matching.
        mentions_2023 = frame["url"].str.contains("2023", na=False)
        amc23_rows = frame[mentions_2023]
        if end > 0:
            return amc23_rows.iloc[start:end]
        return amc23_rows.iloc[start:]
{prompt}" 14 | # Add starter code on top of the initial template 15 | with_starter_code_template: "{input}\n{starter_code}" 16 | # Optionally, you can filter the dataset by difficulty 17 | # preprocess_config: 18 | # difficulty: easy 19 | -------------------------------------------------------------------------------- /skythought/evals/tasks/arc/arc_c.yaml: -------------------------------------------------------------------------------- 1 | handler: arc_c 2 | dataset_path: allenai/ai2_arc 3 | dataset_subset: ARC-Challenge 4 | dataset_split: train 5 | question_key: question 6 | answer_key: answerKey 7 | templating_parameters: 8 | # We combine choices for a question into choices_text entry in the dataset 9 | template: "Given the following question and four candidate answers (A, B, C and D), choose the best answer. Your response should end with \"The best answer is [the_answer_letter]\" where [the_answer_letter] is one of the four letter choice (A, B, C, or D).\n{question}\n{choices_text}" -------------------------------------------------------------------------------- /skythought/evals/tasks/gpqa_diamond/gpqa_diamond.yaml: -------------------------------------------------------------------------------- 1 | handler: gpqa_diamond 2 | dataset_path: Idavidrein/gpqa 3 | dataset_subset: gpqa_diamond 4 | dataset_split: train 5 | question_key: Question 6 | answer_key: Answer 7 | templating_parameters: 8 | # For GPQA, we combine the Question key and the multiple choice answers into a single `prompt` entry 9 | template: "Return your final response within \\boxed{{}} and only include the letter choice (A, B, C, or D) as your final response. 
{prompt}" -------------------------------------------------------------------------------- /skythought/evals/tasks/gsm8k/gsm8k.yaml: -------------------------------------------------------------------------------- 1 | handler: gsm8k 2 | dataset_path: "openai/gsm8k" 3 | dataset_subset: main 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Given the following problem, reason and give a final answer to the problem.\nProblem: {question}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem." 9 | 10 | -------------------------------------------------------------------------------- /skythought/evals/tasks/liveaops/liveaops.yaml: -------------------------------------------------------------------------------- 1 | handler: liveaops 2 | dataset_path: https://livemathbench.github.io/data/LiveAoPSBench-2024.jsonl 3 | dataset_subset: null # which subset on huggingface. Not applicable for a URL dataset 4 | dataset_split: null # Rule based evaluation 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {question}" 9 | -------------------------------------------------------------------------------- /skythought/evals/tasks/livecodebench/livecodebench.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. 
{prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | # Optionally, you can filter the dataset by difficulty 14 | # preprocess_config: 15 | # difficulty: easy 16 | -------------------------------------------------------------------------------- /skythought/evals/tasks/livecodebench/livecodebench_easy.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | preprocess_config: 14 | difficulty: easy 15 | -------------------------------------------------------------------------------- /skythought/evals/tasks/livecodebench/livecodebench_hard.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. 
Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | preprocess_config: 14 | difficulty: hard 15 | -------------------------------------------------------------------------------- /skythought/evals/tasks/livecodebench/livecodebench_medium.yaml: -------------------------------------------------------------------------------- 1 | handler: livecodebench 2 | dataset_path: "livecodebench/code_generation_lite" # repo ID in huggingface 3 | dataset_subset: null 4 | dataset_split: test 5 | dataset_kwargs: 6 | version_tag: release_v2 7 | trust_remote_code: true 8 | question_key: task_id 9 | answer_key: null 10 | templating_parameters: 11 | stdin_template: "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}" 12 | non_stdin_template: "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}" 13 | preprocess_config: 14 | difficulty: medium 15 | -------------------------------------------------------------------------------- /skythought/evals/tasks/math/math500.yaml: -------------------------------------------------------------------------------- 1 | handler: math 2 | dataset_path: "qq8933/MATH500" # repo ID in huggingface 3 | dataset_subset: null # which subset on huggingface 4 | question_key: problem 5 | answer_key: answer 6 | dataset_split: test 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {problem}" 9 | # optional. Not supported yet. 10 | # fewshot_config: 11 | # - question: ... 12 | # - target: ... 
class MinervaMathTaskHandler(MathTaskHandler):
    """Math task handler that extracts the answer from the reference text."""

    def check_correctness(self, problem, generation):
        """Compare the model's extracted answer with the reference answer.

        Both the reference field (``task_config.answer_key``) and the
        generation go through ``extract_answer`` and then
        ``strip_answer_string`` before being compared.

        Args:
            problem: Dataset row holding the reference under the answer key.
            generation: Model output text to grade.

        Returns:
            The result of ``math_equal(prediction, reference)``.
        """
        reference_text = problem[self.task_config.answer_key]
        expected = strip_answer_string(extract_answer(reference_text))
        predicted = strip_answer_string(extract_answer(generation))
        return math_equal(predicted, expected)
{prompt}" 9 | -------------------------------------------------------------------------------- /skythought/evals/tasks/mmlu/mmlu_pro.yaml: -------------------------------------------------------------------------------- 1 | handler: mmlu_pro 2 | dataset_path: TIGER-Lab/MMLU-Pro 3 | dataset_subset: default 4 | dataset_split: test 5 | question_key: question 6 | answer_key: answer 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | -------------------------------------------------------------------------------- /skythought/evals/tasks/numina/numina.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | # Optionally, you can filter the dataset by difficulty 10 | # preprocess_config: 11 | # filter_difficulty: true 12 | # math_difficulty_lower_bound: 4 13 | # math_difficulty_upper_bound: 9 14 | # source: math 15 | -------------------------------------------------------------------------------- /skythought/evals/tasks/numina/numina_amc_aime.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. 
{prompt}" 9 | preprocess_config: 10 | filter_difficulty: true 11 | math_difficulty_lower_bound: 1 12 | math_difficulty_upper_bound: 9 13 | source: amc_aime 14 | -------------------------------------------------------------------------------- /skythought/evals/tasks/numina/numina_math.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. {prompt}" 9 | preprocess_config: 10 | filter_difficulty: true 11 | math_difficulty_lower_bound: 4 12 | math_difficulty_upper_bound: 9 13 | source: math 14 | -------------------------------------------------------------------------------- /skythought/evals/tasks/numina/numina_olympiads.yaml: -------------------------------------------------------------------------------- 1 | handler: numina 2 | dataset_path: "AI-MO/NuminaMath-CoT" 3 | dataset_subset: null 4 | dataset_split: train 5 | question_key: problem 6 | answer_key: solution 7 | templating_parameters: 8 | template: "Return your final response within \\boxed{{}}. 
class OlympiadBenchMathTaskHandler(MathTaskHandler):
    """Math task handler for rows whose reference answer is stored in a list."""

    def check_correctness(self, problem, generation):
        """Compare the model's extracted answer with the first reference answer.

        Args:
            problem: Dataset row; the value under ``task_config.answer_key``
                is indexed with ``[0]`` to obtain the reference answer.
            generation: Model output text to grade.

        Returns:
            The result of ``math_equal(prediction, reference)``.
        """
        # all problems have final answer in a list
        first_reference = problem[self.task_config.answer_key][0]
        expected = strip_answer_string(first_reference)
        predicted = strip_answer_string(extract_answer(generation))
        return math_equal(predicted, expected)
class OMNIMathTaskHandler(MathTaskHandler):
    """Math task handler that formats prompts from the whole dataset row."""

    def generate_prompt(self, problem):
        """Fill the configured template with the fields of ``problem``.

        Args:
            problem: Mapping whose keys are passed to ``str.format`` as
                keyword arguments.

        Returns:
            The formatted prompt string.
        """
        template = self.task_config.templating_parameters["template"]
        return template.format(**problem)

    def check_correctness(self, problem, generation):
        """Compare the model's extracted answer with the raw reference answer.

        Args:
            problem: Dataset row holding the reference under
                ``task_config.answer_key``.
            generation: Model output text to grade.

        Returns:
            The result of ``math_equal(prediction, reference)``.
        """
        # no preprocessing needed: the reference is used exactly as stored
        expected = problem[self.task_config.answer_key]
        predicted = strip_answer_string(extract_answer(generation))
        return math_equal(predicted, expected)
def get_tasks(task_root_dir: str) -> Dict[str, str]:
    """Return a mapping from task name to the path of its YAML file.

    Every ``*.yaml`` file under ``task_root_dir`` (searched recursively) is
    registered under its file name with the ``.yaml`` extension removed,
    e.g. ``arc.yaml`` -> ``arc``.

    Args:
        task_root_dir: Directory to search for task YAML files.

    Returns:
        Dictionary of task names to YAML file paths. It is empty when the
        directory has no YAML files or does not exist. When two files share
        a name, the path that sorts last is kept.
    """
    pattern = os.path.join(task_root_dir, "**", "*.yaml")
    name_to_yaml: Dict[str, str] = {}
    # glob returns paths in filesystem order; sorting makes the winner of a
    # duplicate task name the same on every machine.
    for yaml_file in sorted(glob.glob(pattern, recursive=True)):
        # Strip only the final extension so a dotted name such as
        # "math.v2.yaml" stays "math.v2" instead of being cut to "math".
        name, _ = os.path.splitext(os.path.basename(yaml_file))
        name_to_yaml[name] = yaml_file
    return name_to_yaml
@dataclass
class SummaryResults:
    """Aggregate results of one evaluation run."""

    # Configuration recorded with the run (required; no default).
    configuration: Dict[str, Any]
    # Token counters, total and average, for completions and prompts.
    total_completion_tokens: int = 0
    avg_completion_tokens: float = 0
    total_prompt_tokens: int = 0
    avg_prompt_tokens: float = 0
    accuracy: float = 0.0
    # Optional pass@k values keyed by string; None when not set.
    pass_at_k: Optional[Dict[str, float]] = None

    def to_json_dict(self) -> Dict[str, Any]:
        """Return the fields of this summary as a JSON-compatible dict."""
        as_mapping = asdict(self)
        return as_mapping


def save_summary(summary_path: Path, summary: SummaryResults) -> None:
    """Write ``summary`` to ``summary_path`` as UTF-8 JSON indented by 4.

    Args:
        summary_path: Destination file; it is created or overwritten.
        summary: The results to serialize.
    """
    payload = summary.to_json_dict()
    with open(summary_path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)
-------------------------------------------------------------------------------- /skythought/skythought-rl/Notice.txt: -------------------------------------------------------------------------------- 1 | Copyright 2023-2024 Bytedance Ltd. and/or its affiliates -------------------------------------------------------------------------------- /skythought/skythought-rl/README.md: -------------------------------------------------------------------------------- 1 | ### Install veRL: 2 | 1. Create a conda environment: 3 | 4 | ```bash 5 | conda create -n verl python==3.9 6 | conda activate verl 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | 2. Install common dependencies (required for all backends) 11 | 12 | ```bash 13 | pip3 install vllm==0.6.3 # or you can install 0.5.4, 0.4.2 and 0.3.1 14 | pip3 install ray 15 | 16 | # flash attention 2 17 | pip3 install flash-attn --no-build-isolation 18 | ``` 19 | 20 | 3. Install veRL 21 | 22 | ```bash 23 | pip3 install -e . 24 | ``` 25 | 26 | ### Prepare the data 27 | `python data/data_prepare_*.py --output {corresponding path}` 28 | 29 | ### Launch the training 30 | ```bash 31 | cd examples/sky-t1 32 | bash ./run-sky-t1-7b-zero.sh 33 | ``` 34 | 35 | 36 | ### Acknowledgement 37 | This repo is modified on top of [VeRL](https://github.com/volcengine/verl) and [PRIME](https://github.com/PRIME-RL/PRIME). 38 | -------------------------------------------------------------------------------- /skythought/skythought-rl/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = verl 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /skythought/skythought-rl/docs/README.md: -------------------------------------------------------------------------------- 1 | # veRL documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d _build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. -------------------------------------------------------------------------------- /skythought/skythought-rl/docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/skythought-rl/docs/_static/logo.png -------------------------------------------------------------------------------- /skythought/skythought-rl/docs/advance/placement.rst: -------------------------------------------------------------------------------- 1 | Ray API Design Tutorial 2 | ======================================= 3 | 4 | We provide a tutorial for our Ray API design, including: 5 | 6 | - Ray basic concepts 7 | - Resource Pool and RayWorkerGroup 8 | - Data Dispatch, Execution and Collection 9 | - Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool 10 | 11 | See details in `tutorial.ipynb `_. 
-------------------------------------------------------------------------------- /skythought/skythought-rl/docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | # markdown support 2 | recommonmark 3 | # markdown table support 4 | sphinx-markdown-tables 5 | 6 | # theme default rtd 7 | 8 | # crate-docs-theme 9 | sphinx-rtd-theme -------------------------------------------------------------------------------- /skythought/skythought-rl/examples/generation/run_deepseek_v2_lite_math.sh: -------------------------------------------------------------------------------- 1 | python3 -m verl.trainer.main_generation \ 2 | trainer.nnodes=1 \ 3 | trainer.n_gpus_per_node=8 \ 4 | data.path=~/data/rlhf/gsm8k/test.parquet \ 5 | data.prompt_key=prompt \ 6 | data.n_samples=1 \ 7 | data.output_path=~/data/rlhf/math/deepseek_v2_lite_gen_test.parquet \ 8 | model.path=deepseek-ai/deepseek-llm-7b-chat \ 9 | +model.trust_remote_code=True \ 10 | rollout.temperature=1.0 \ 11 | rollout.top_k=50 \ 12 | rollout.top_p=0.7 \ 13 | rollout.prompt_length=2048 \ 14 | rollout.response_length=1024 \ 15 | rollout.tensor_model_parallel_size=2 \ 16 | rollout.gpu_memory_utilization=0.8 17 | -------------------------------------------------------------------------------- /skythought/skythought-rl/examples/sft/gsm8k/run_deepseek_6b7.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | hdfs_path=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ # replace with your own hdfs/local path 4 | 5 | nproc_per_node=$1 6 | 7 | torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ 8 | -m verl.trainer.fsdp_sft_trainer \ 9 | data.train_files=$HOME/data/gsm8k/train.parquet \ 10 | data.val_files=$HOME/data/gsm8k/test.parquet \ 11 | data.prompt_key=prompt \ 12 | data.response_key=answer \ 13 | data.micro_batch_size=8 \ 14 | model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \ 15 | 
trainer.default_hdfs_dir=$hdfs_path \ 16 | trainer.project_name=gsm8k-sft \ 17 | trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \ 18 | trainer.total_epochs=4 \ 19 | trainer.logger=['console','wandb'] -------------------------------------------------------------------------------- /skythought/skythought-rl/examples/sft/gsm8k/run_gemma_7b.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-1.1-7b-it/ # replace to your own hdfs/local path 4 | 5 | nproc_per_node=$1 6 | 7 | torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ 8 | -m verl.trainer.fsdp_sft_trainer \ 9 | data.train_files=$HOME/data/gsm8k/train.parquet \ 10 | data.val_files=$HOME/data/gsm8k/test.parquet \ 11 | data.prompt_key=prompt \ 12 | data.response_key=answer \ 13 | data.micro_batch_size=8 \ 14 | model.partial_pretrain=google/gemma-1.1-7b-it \ 15 | trainer.default_hdfs_dir=$hdfs_path \ 16 | trainer.project_name=gsm8k-sft \ 17 | trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \ 18 | trainer.total_epochs=4 \ 19 | trainer.logger=['console','wandb'] -------------------------------------------------------------------------------- /skythought/skythought-rl/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | codetiming 3 | datasets 4 | dill 5 | hydra-core 6 | numpy 7 | pybind11 8 | ray 9 | tensordict<0.6 10 | transformers 11 | vllm<=0.6.3 12 | wandb 13 | pyext 14 | word2number 15 | pylatexenc -------------------------------------------------------------------------------- /skythought/skythought-rl/tests/ray/detached_worker/README.md: -------------------------------------------------------------------------------- 1 | # Detached Worker 2 | ## How to run (Only on a single node) 3 | - Start a local ray cluster: 4 | ```bash 5 | ray start --head --port=6379 6 | ``` 7 | - Run the server 8 | ```bash 9 | python3 
server.py 10 | ``` 11 | - In another terminal, run the client 12 | ```bash 13 | python3 client.py 14 | ``` 15 | -------------------------------------------------------------------------------- /skythought/skythought-rl/tests/ray/detached_worker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ray start --head --port=6379 3 | python3 server.py 4 | python3 client.py 5 | ray stop --force -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/models/llama/megatron/checkpoint_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/models/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/single_controller/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__))) 18 | 19 | with open(os.path.join(os.path.join(version_folder, os.pardir), 'version/version')) as f: 20 | __version__ = f.read().strip() 21 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/single_controller/base/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .worker import Worker 16 | from .worker_group import WorkerGroup, ClassWithInitArgs, ResourcePool 17 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/single_controller/base/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/single_controller/base/register_center/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/single_controller/ray/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls 16 | from .megatron import (MegatronRayWorkerGroup, DistRankInfo, DistGlobalInfo) -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/third_party/vllm/vllm_v_0_3_1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/third_party/vllm/vllm_v_0_4_2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/third_party/vllm/vllm_v_0_5_4/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/third_party/vllm/vllm_v_0_6_3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/trainer/config/evaluation.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | path: /tmp/math_Qwen2-7B-Instruct.parquet 3 | prompt_key: prompt 4 | response_key: responses 5 | data_source_key: data_source 6 | reward_model_key: reward_model -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/trainer/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/trainer/runtime_env.yaml: -------------------------------------------------------------------------------- 1 | working_dir: ./ 2 | excludes: ["/.git/"] 3 | env_vars: 4 | TORCH_NCCL_AVOID_RECORD_STREAMS: "1" 5 | VLLM_ATTENTION_BACKEND: "XFORMERS" -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
import tokenizer 16 | from .tokenizer import * 17 | 18 | __all__ = tokenizer.__all__ -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Format 2 | ## RLHF dataset 3 | We combine all the data sources into a single parquet file. We directly organize the prompt into the chat format so that multi-turn chats can be easily incorporated. In the prompt, we may add instruction-following text to guide the model to output the answers in a particular format so that we can extract the answers. 4 | 5 | Math problems 6 | ```json 7 | { 8 | "data_source": "openai/gsm8k", 9 | "prompt": [{"role": "user", "content": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Let's think step by step and output the final answer after \"####\""}], 10 | "ability": "math", 11 | "reward_model": { 12 | "style": "rule", 13 | "ground_truth": ["72"] 14 | } 15 | } 16 | ``` 17 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .rl_dataset import RLHFDataset 16 | from .rm_dataset import RMDataset 17 | from .sft_dataset import SFTDataset 18 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/debug/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .performance import log_gpu_memory_usage -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/rendezvous/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/utils/reward_score/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/version/version: -------------------------------------------------------------------------------- 1 | 0.1 -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/actor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import BasePPOActor 16 | from .dp_actor import DataParallelPPOActor 17 | 18 | __all__ = ["BasePPOActor", "DataParallelPPOActor"] 19 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/critic/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .base import BasePPOCritic 16 | from .dp_critic import DataParallelPPOCritic 17 | 18 | __all__ = ["BasePPOCritic", "DataParallelPPOCritic"] 19 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/reward_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import BasePPORewardModel 16 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/reward_model/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .reward_model import MegatronRewardModel 16 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/rollout/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import BaseRollout 16 | from .naive import NaiveRollout 17 | from .hf_rollout import HFRollout 18 | 19 | __all__ = ["BaseRollout", "NaiveRollout", "HFRollout"] 20 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/rollout/naive/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .naive_rollout import NaiveRollout 16 | -------------------------------------------------------------------------------- /skythought/skythought-rl/verl/workers/rollout/vllm_rollout/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .vllm_rollout import vLLMRollout -------------------------------------------------------------------------------- /skythought/test-time-scaling/assets/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/test-time-scaling/assets/figure1.png -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --method naive_nodspy \ 10 | --lcb_version release_v2 \ 11 | --result_json_path="results/baselines_4o_mini_${difficulty}.json" \ 12 | 13 | done 14 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/4o_mini_cct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /root/miniconda3/etc/profile.d/conda.sh 4 | conda activate sstar 5 | 6 | python codecontest_evaluate_multiprocess.py \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --method naive_nodspy \ 10 | --generator 4o \ 11 | --result_json_path="results/baselines_4o_codecontest.json" 12 | 13 | 14 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/o1_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator o1-mini \ 10 | --method naive_nodspy \ 11 | --lcb_version 
release_v2 \ 12 | --result_json_path="results/baselines_o1_mini_${difficulty}.json" 13 | 14 | done 15 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/o1_preview.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator o1-preview \ 10 | --method naive_nodspy \ 11 | --lcb_version release_v2 \ 12 | --result_json_path="results/baselines_o1_preview_${difficulty}.json" 13 | 14 | done 15 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/o3_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator o3-mini \ 10 | --method naive_nodspy \ 11 | --lcb_version release_v2 \ 12 | --result_json_path="results/baselines_o3_mini_${difficulty}.json" 13 | 14 | done 15 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen0.5b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen0.5b \ 10 | --api_name Qwen/Qwen2.5-Coder-0.5B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | --result_json_path="results/baselines_qwen0.5b_${difficulty}.json" 15 | 16 | done 17 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen1.5b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen1.5b \ 10 | --api_name Qwen/Qwen2.5-Coder-1.5B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | --result_json_path="results/baselines_qwen1.5b_${difficulty}.json" 15 | 16 | done 17 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen14b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen14b \ 10 | --api_name Qwen/Qwen2.5-Coder-14B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | --result_json_path="results/baselines_qwen14b_${difficulty}.json" 15 | 16 | done 17 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen32b \ 10 | --api_name Qwen/Qwen2.5-Coder-32B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | 
--result_json_path="results/baselines_qwen32b_${difficulty}.json" 15 | 16 | done 17 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen3b \ 10 | --api_name Qwen/Qwen2.5-Coder-3B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | --result_json_path="results/baselines_qwen3b_${difficulty}.json" 15 | 16 | done 17 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwen7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen7b \ 10 | --api_name Qwen/Qwen2.5-Coder-7B-Instruct \ 11 | --api_base http://localhost:8000/v1 \ 12 | --method naive_nodspy \ 13 | --lcb_version release_v2 \ 14 | --result_json_path="results/baselines_qwen7b_${difficulty}.json" 15 | 16 | done 17 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/qwq32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator qwen32b \ 10 | --api_name Qwen/QwQ-32B-Preview \ 11 | --api_base http://localhost:8000/v1 \ 12 | 
--no_dspy_gen \ 13 | --method naive_nodspy \ 14 | --lcb_version release_v2 \ 15 | --result_json_path="results/baselines_qwq32b_${difficulty}.json" 16 | 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/r1qwen14b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator r1qwen32b \ 10 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 11 | --api_base http://localhost:8000/v1 \ 12 | --no_dspy_gen \ 13 | --method naive_nodspy \ 14 | --lcb_version release_v2 \ 15 | --result_json_path="results/baselines_r1qwen14b_${difficulty}.json" 16 | 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/r1qwen32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator r1qwen32b \ 10 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 11 | --api_base http://localhost:8000/v1 \ 12 | --no_dspy_gen \ 13 | --method naive_nodspy \ 14 | --lcb_version release_v2 \ 15 | --result_json_path="results/baselines_r1qwen32b_${difficulty}.json" 16 | 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines/r1qwen7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python 
evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --generator r1qwen32b \ 10 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 11 | --api_base http://localhost:8000/v1 \ 12 | --no_dspy_gen \ 13 | --method naive_nodspy \ 14 | --lcb_version release_v2 \ 15 | --result_json_path="results/baselines_r1qwen7b_${difficulty}.json" 16 | 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/4o_mini_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=1.0 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --lcb_version release_v2 \ 13 | --ablation_qwq_vanilla_without_reasoning \ 14 | --ablation_qwq_debug_with_4o_mini \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_4o_mini_n_1_debug_public3_select_random_${difficulty}.json" 17 | 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/o1_mini_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --no_dspy_gen \ 12 | --generator=o1-mini \ 13 | --selection=random \ 14 | --lcb_version release_v2 \ 15 | --ablation_qwq_vanilla_without_reasoning \ 16 | --ablation_qwq_debug_with_4o_mini \ 17 | --num_round ${MAX_ROUND} \ 18 | --result_json_path="results/final_o1_mini_n_1_debug_public3_select_random_${difficulty}.json" 19 | 20 | done 21 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen0.5b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --no_dspy_gen \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen0.5b_n_1_debug_public3_select_random_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen1.5b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-1.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --no_dspy_gen \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen1.5b_n_1_debug_public3_select_random_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen14b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | 
--difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen14b_n_1_debug_public3_select_random_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen32b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen32b_n_1_debug_public3_select_random_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen3b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-3B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --no_dspy_gen \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen3b_n_1_debug_public3_select_random_${difficulty}.json" 18 | done 19 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwen7b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen7b_n_1_debug_public3_select_random_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/qwq32b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name Qwen/QwQ-32B-Preview \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_qwq32b_n_1_debug_public3_select_random_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/r1qwen14b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium 
hard 5 | do 6 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen14b_n_1_debug_public3_select_random_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/r1qwen32b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=1 \ 11 | --selection=random \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen32b_n_1_debug_public3_select_random_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/baselines_selfdebug/r1qwen7b_n_1_debug_public3_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 
10 | --n=1 \ 11 | --selection=random \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen7b_n_1_debug_public3_select_random_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/4omini_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --lcb_version release_v2 \ 13 | --num_round ${MAX_ROUND} \ 14 | --result_json_path="results/final_4omini_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 15 | --load_cached_preds \ 16 | --cached_preds_path="results/final_4omini_n_16_debug_public3_select_oracle_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/batch_small_models_first.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting Qwen-0.5B evaluation..." 4 | bash scripts/final_first_cached/qwen0.5b_n_16_debug_public3_select_first_cached.sh 5 | 6 | echo "Starting Qwen-1.5B evaluation..." 7 | bash scripts/final_first_cached/qwen1.5b_n_16_debug_public3_select_first_cached.sh 8 | 9 | echo "Starting Qwen-3B evaluation..." 10 | bash scripts/final_first_cached/qwen3b_n_16_debug_public3_select_first_cached.sh 11 | 12 | echo "All evaluations completed!" 
13 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/o1mini_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --lcb_version release_v2 \ 13 | --num_round ${MAX_ROUND} \ 14 | --result_json_path="results/final_o1mini_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 15 | --load_cached_preds \ 16 | --cached_preds_path="results/final_o1mini_n_16_debug_public3_select_oracle_${difficulty}.json" 17 | done 18 | 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen0.5b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen0.5b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen0.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen1.5b_n_16_debug_public3_select_first_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-1.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen1.5b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen1.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen14b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen14b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen32b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy 
medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen32b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen3b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-3B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen3b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen3b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/qwen7b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | 
--selection=first \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen7b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/r1qwen14b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_r1qwen14b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_r1qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/r1qwen32b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 13 | --api_base http://localhost:8000/v1 \ 14 | 
--lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_r1qwen32b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_r1qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_first_cached/r1qwen7b_n_16_debug_public3_select_first_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=first \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_r1qwen7b_n_16_debug_public3_select_first_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_r1qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/4omini_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --test_generator 4o-mini \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --result_json_path="results/final_4omini_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 16 
| --load_cached_preds \ 17 | --cached_preds_path="results/final_4omini_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/batch_small_models_gentest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting Qwen-0.5B evaluation..." 4 | bash scripts/final_gentest_notimeout_cached/qwen0.5b_n_16_debug_public3_select_4omini_cached.sh 5 | 6 | echo "Starting Qwen-1.5B evaluation..." 7 | bash scripts/final_gentest_notimeout_cached/qwen1.5b_n_16_debug_public3_select_4omini_cached.sh 8 | 9 | echo "Starting Qwen-3B evaluation..." 10 | bash scripts/final_gentest_notimeout_cached/qwen3b_n_16_debug_public3_select_4omini_cached.sh 11 | 12 | echo "All evaluations completed!" -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/o1mini_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --test_generator 4o-mini \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --result_json_path="results/final_o1mini_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 16 | --load_cached_preds \ 17 | --cached_preds_path="results/final_o1mini_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen0.5b_n_16_debug_public3_select_4omini_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen0.5b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen0.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen1.5b_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-1.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen1.5b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen1.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen14b_n_16_debug_public3_select_4omini_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen14b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen32b_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen32b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen3b_n_16_debug_public3_select_4omini_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-3B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen3b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen3b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/qwen7b_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen7b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/r1qwen14b_n_16_debug_public3_select_4omini_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen14b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/r1qwen32b_n_16_debug_public3_select_4omini_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen32b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_gentest_notimeout_cached/r1qwen7b_n_16_debug_public3_select_4omini_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_no_timeout \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen7b_n_16_debug_public3_select_4omini_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/4omini_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection oracle_all_rounds \ 12 | --lcb_version release_v2 \ 13 | --num_round ${MAX_ROUND} \ 14 | --result_json_path="results/final_4omini_n_16_debug_public3_select_oracle_${difficulty}.json" 15 | done 16 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/o1mini_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=1.0 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --generator o1-mini \ 12 | --selection 
oracle_all_rounds \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --ablation_qwq_vanilla_without_reasoning \ 16 | --ablation_qwq_debug_with_4o_mini \ 17 | --no_dspy_gen \ 18 | --result_json_path="results/final_o1mini_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen0.5b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --no_dspy_gen \ 12 | --api_name Qwen/Qwen2.5-Coder-0.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --selection oracle_all_rounds \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen0.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen1.5b_n_32_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --no_dspy_gen \ 12 | --api_name Qwen/Qwen2.5-Coder-1.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --selection oracle_all_rounds \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen1.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/final_oracle/qwen14b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 12 | --api_base http://localhost:8000/v1 \ 13 | --selection oracle_all_rounds \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen32b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=16 \ 11 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 12 | --api_base http://localhost:8000/v1 \ 13 | --selection oracle_all_rounds \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 17 | done 18 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen32b_n_16_debug_public3_select_oracle_icl_patterns.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in medium 5 | do 6 | num_icl=1 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=16 \ 12 | 
--api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --selection oracle_all_rounds \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --num_icl_examples ${num_icl} \ 18 | --icl_retriever pattern \ 19 | --result_json_path="results_final/final_qwen32b_n_16_debug_public3_select_oracle_icl_${num_icl}_patterns_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen3b_n_32_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --no_dspy_gen \ 12 | --api_name Qwen/Qwen2.5-Coder-3B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --selection oracle_all_rounds \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen3b_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwen7b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 12 | --api_base http://localhost:8000/v1 \ 13 | --selection oracle_all_rounds \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 17 | done 18 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/qwq32b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=16 \ 11 | --api_name Qwen/QwQ-32B-Preview \ 12 | --api_base http://localhost:8000/v1 \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --selection oracle_all_rounds \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_qwq32b_n_16_debug_public3_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/r1qwen14b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 12 | --api_base http://localhost:8000/v1 \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --selection oracle_all_rounds \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/final_oracle/r1qwen32b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=16 \ 11 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 12 | --api_base http://localhost:8000/v1 \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --selection oracle_all_rounds \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_oracle/r1qwen7b_n_16_debug_public3_select_oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 12 | --api_base http://localhost:8000/v1 \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --selection oracle_all_rounds \ 16 | --no_dspy_gen \ 17 | --ablation_qwq_vanilla_without_reasoning \ 18 | --ablation_qwq_debug_with_4o_mini \ 19 | --result_json_path="results/final_r1qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/4omini_n_16_debug_public3_select_random_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --result_json_path="results/final_4omini_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 16 | --load_cached_preds \ 17 | --cached_preds_path="results/final_4omini_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/batch_small_models_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting Qwen-0.5B evaluation..." 4 | bash scripts/final_random_cached/qwen0.5b_n_16_debug_public3_select_random_cached.sh 5 | 6 | echo "Starting Qwen-1.5B evaluation..." 7 | bash scripts/final_random_cached/qwen1.5b_n_16_debug_public3_select_random_cached.sh 8 | 9 | echo "Starting Qwen-3B evaluation..." 10 | bash scripts/final_random_cached/qwen3b_n_16_debug_public3_select_random_cached.sh 11 | 12 | echo "All evaluations completed!" 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/o1mini_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --test_generator 4o-mini \ 14 | --lcb_version release_v2 \ 15 | --num_round ${MAX_ROUND} \ 16 | --result_json_path="results/final_o1mini_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 17 | --load_cached_preds \ 18 | --cached_preds_path="results/final_o1mini_n_16_debug_public3_select_oracle_${difficulty}.json" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/qwen0.5b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen0.5b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen0.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/final_random_cached/qwen1.5b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-1.5B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen1.5b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen1.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/qwen14b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen14b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/final_random_cached/qwen32b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen32b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/qwen3b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-3B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen3b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen3b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/qwen7b_n_16_debug_public3_select_random_cached.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen7b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/r1qwen14b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen14b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/r1qwen32b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 
2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen32b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_random_cached/r1qwen7b_n_16_debug_public3_select_random_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=random \ 12 | --seed=40 \ 13 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 14 | --api_base http://localhost:8000/v1 \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen7b_n_16_debug_public3_select_random_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/4omini_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python 
evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --test_generator 4o-mini \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --result_json_path="results/final_4omini_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 16 | --load_cached_preds \ 17 | --cached_preds_path="results/final_4omini_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/batch_small_models_tool_assisted.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting Qwen-0.5B evaluation..." 4 | bash scripts/final_tool_assisted_cached/qwen0.5b_n_16_debug_public3_select_tool_assisted_cached.sh 5 | 6 | echo "Starting Qwen-1.5B evaluation..." 7 | bash scripts/final_tool_assisted_cached/qwen1.5b_n_16_debug_public3_select_tool_assisted_cached.sh 8 | 9 | echo "Starting Qwen-3B evaluation..." 10 | bash scripts/final_tool_assisted_cached/qwen3b_n_16_debug_public3_select_tool_assisted_cached.sh 11 | 12 | echo "All evaluations completed!" 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/o1mini_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --test_generator 4o-mini \ 13 | --lcb_version release_v2 \ 14 | --num_round ${MAX_ROUND} \ 15 | --result_json_path="results/final_o1mini_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 16 | --load_cached_preds \ 17 | --cached_preds_path="results/final_o1mini_n_16_debug_public3_select_oracle_${difficulty}.json" 18 | done 19 | 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen0.5b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen0.5b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen0.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen1.5b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-1.5B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen1.5b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen1.5b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen14b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-14B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen14b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | 22 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen32b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen32b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen32b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen3b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-3B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen3b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen3b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/qwen7b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_qwen7b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/r1qwen14b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen14b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen14b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 
| done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/final_tool_assisted_cached/r1qwen7b_n_16_debug_public3_select_tool_assisted_cached.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=3 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=32 \ 10 | --n=16 \ 11 | --selection=generated_tests_tool_assisted \ 12 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 13 | --api_base http://localhost:8000/v1 \ 14 | --test_generator 4o-mini \ 15 | --lcb_version release_v2 \ 16 | --num_round ${MAX_ROUND} \ 17 | --result_json_path="results/final_r1qwen7b_n_16_debug_public3_select_tool_assisted_cached_${difficulty}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/final_r1qwen7b_n_16_debug_public3_select_oracle_${difficulty}.json" 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/4o_mini_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --selection generated_tests_majority_no_public_tests \ 15 | --result_json_path="results/majority_4o_mini_n_16_${difficulty}.json" \ 16 | 17 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/o1_mini_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for 
difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --generator o1-mini \ 15 | --selection generated_tests_majority_no_public_tests \ 16 | --result_json_path="results/majority_o1_mini_n_16_${difficulty}.json" \ 17 | 18 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen0.5b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-0.5B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_qwen0.5b_n_16_${difficulty}.json" \ 18 | 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen1.5b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-1.5B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | 
--result_json_path="results/majority_qwen1.5b_n_16_${difficulty}.json" \ 18 | 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen14b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-14B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_qwen14b_n_16_${difficulty}.json" \ 18 | 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen32b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=128 \ 9 | --n=16 \ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-32B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_qwen32b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen3b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in 
easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-3B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_qwen3b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwen7b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=16 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_qwen7b_n_16_${difficulty}.json" \ 18 | 19 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/qwq32b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy 4 | do 5 | python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=32 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name Qwen/QwQ-32B-Preview \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | 
--result_json_path="results/majority_qwq32b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/r1qwen14b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=32 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_r1qwen14b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/r1qwen32b_n_16_majority.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=32 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_r1qwen32b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/majority_baselines/r1qwen7b_n_16_majority.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | /root/miniconda3/envs/sstar/bin/python evaluate_multiprocess.py \ 6 | --difficulty=${difficulty} \ 7 | --temperature=0.7 \ 8 | --num_threads=32 \ 9 | --n=16 \ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v2 \ 12 | --num_round 1 \ 13 | --no_dspy_gen \ 14 | --api_name deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ 15 | --api_base http://localhost:8000/v1 \ 16 | --selection generated_tests_majority_no_public_tests \ 17 | --result_json_path="results/majority_r1qwen7b_n_16_${difficulty}.json" \ 18 | 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp02_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | for n in 1 2 4 8 16 32 64 128 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.2 \ 10 | --num_threads=32 \ 11 | --n=${n} \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --no_refine \ 17 | --num_round 1 \ 18 | --result_json_path="results/sec4_parallel_sample_temp02_4o_mini_${difficulty}_n_${n}.json" 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp02_qwen7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.2 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | 
--selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp02_qwen_7b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp02_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.2 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp02_qwen_32b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp05_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | for n in 1 2 4 8 16 32 64 128 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.5 \ 10 | --num_threads=32 \ 11 | --n=${n} \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --no_refine \ 17 | --num_round 1 \ 18 | 
--result_json_path="results/sec4_parallel_sample_temp05_4o_mini_${difficulty}_n_${n}.json" 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp05_qwen7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.5 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp05_qwen_7b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp05_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.5 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp05_qwen_32b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | 
-------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp09_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | for n in 1 2 4 8 16 32 64 128 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.95 \ 10 | --num_threads=32 \ 11 | --n=${n} \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --no_refine \ 17 | --num_round 1 \ 18 | --result_json_path="results/sec4_parallel_sample_temp09_4o_mini_${difficulty}_n_${n}.json" 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp09_qwen7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.95 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp09_qwen_7b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/temp09_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve 
Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.95 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_temp09_qwen_32b_${difficulty}_n_${n}.json" 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for difficulty in easy medium hard 4 | do 5 | for n in 1 2 4 8 16 32 64 128 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=32 \ 11 | --n=${n} \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --no_refine \ 17 | --num_round 1 \ 18 | --result_json_path="results/sec4_parallel_sample_vanilla_4o_mini_${difficulty}_n_${n}.json" 19 | done 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.7 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 
| --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_vanilla_qwen_32b_${difficulty}_n_${n}.json" 23 | done 24 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_qwen_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.7 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 21 | --api_base http://localhost:8000/v1 \ 22 | --result_json_path="results/sec4_parallel_sample_vanilla_qwen_7b_${difficulty}_n_${n}.json" 23 | done 24 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_qwq_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/QwQ-32B-Preview --tensor-parallel-size 8 4 | 5 | for difficulty in easy medium hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.7 \ 12 | --num_threads=32 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | 
--num_round 1 \ 20 | --api_name Qwen/QwQ-32B-Preview \ 21 | --api_base http://localhost:8000/v1 \ 22 | --no_dspy_gen \ 23 | --result_json_path="results/sec4_parallel_sample_vanilla_qwq_32b_${difficulty}_n_${n}.json" 24 | done 25 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_qwq_32b_hard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/QwQ-32B-Preview --tensor-parallel-size 8 4 | 5 | for difficulty in hard 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.7 \ 12 | --num_threads=8 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name Qwen/QwQ-32B-Preview \ 21 | --api_base http://localhost:8000/v1 \ 22 | --no_dspy_gen \ 23 | --result_json_path="results/sec4_parallel_sample_vanilla_qwq_32b_${difficulty}_n_${n}.json" 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec4_parallel_sample/vanilla_qwq_32b_medium.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/QwQ-32B-Preview --tensor-parallel-size 8 4 | 5 | for difficulty in medium 6 | do 7 | for n in 1 2 4 8 16 32 64 128 8 | do 9 | python evaluate_multiprocess.py \ 10 | --difficulty=${difficulty} \ 11 | --temperature=0.7 \ 12 | --num_threads=8 \ 13 | --n=${n} \ 14 | --selection=oracle \ 15 | --lcb_version release_v4 \ 16 | --start_date 2024-08-01 \ 17 | --end_date 2024-12-01 \ 18 | --no_refine \ 19 | --num_round 1 \ 20 | --api_name Qwen/QwQ-32B-Preview \ 21 | --api_base http://localhost:8000/v1 \ 22 | --no_dspy_gen \ 23 | 
--result_json_path="results/sec4_parallel_sample_vanilla_qwq_32b_${difficulty}_n_${n}.json" 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/last_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=5 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=8 \ 11 | --selection=oracle \ 12 | --lcb_version release_v4 \ 13 | --start_date 2024-08-01 \ 14 | --end_date 2024-12-01 \ 15 | --context last \ 16 | --num_round ${MAX_ROUND} \ 17 | --selection oracle_all_rounds \ 18 | --result_json_path="results/sec5_revision_last_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/last_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --context last \ 18 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 19 | --api_base http://localhost:8000/v1 \ 20 | --selection oracle_all_rounds \ 21 | --result_json_path="results/sec5_revision_last_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 22 | done 23 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/last_qwen_7b.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --context last \ 18 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 19 | --api_base http://localhost:8000/v1 \ 20 | --selection oracle_all_rounds \ 21 | --result_json_path="results/sec5_revision_last_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 22 | done 23 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/refine_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=5 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=8 \ 11 | --selection=oracle \ 12 | --lcb_version release_v4 \ 13 | --start_date 2024-08-01 \ 14 | --end_date 2024-12-01 \ 15 | --num_round ${MAX_ROUND} \ 16 | --selfdebug_decision refine \ 17 | --selection oracle_all_rounds \ 18 | --result_json_path="results/sec5_revision_refine_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 19 | done 20 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/refine_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py 
\ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --selfdebug_decision refine \ 18 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 19 | --api_base http://localhost:8000/v1 \ 20 | --selection oracle_all_rounds \ 21 | --result_json_path="results/sec5_revision_refine_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 22 | done 23 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/refine_qwen_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --selfdebug_decision refine \ 18 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 19 | --api_base http://localhost:8000/v1 \ 20 | --selection oracle_all_rounds \ 21 | --result_json_path="results/sec5_revision_refine_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 22 | done 23 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/vanilla_4o_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAX_ROUND=5 4 | for difficulty in easy medium hard 5 | do 6 | python evaluate_multiprocess.py \ 7 | --difficulty=${difficulty} \ 8 | --temperature=0.7 \ 9 | --num_threads=16 \ 10 | --n=8 \ 11 | --selection=oracle \ 12 | --lcb_version release_v4 \ 13 | 
--start_date 2024-08-01 \ 14 | --end_date 2024-12-01 \ 15 | --num_round ${MAX_ROUND} \ 16 | --selection oracle_all_rounds \ 17 | --result_json_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done 19 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/vanilla_qwen_32b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 8 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 18 | --api_base http://localhost:8000/v1 \ 19 | --selection oracle_all_rounds \ 20 | --result_json_path="results/sec5_revision_vanilla_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 21 | done 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec5_revision/vanilla_qwen_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server: vllm serve Qwen/Qwen2.5-Coder-7B-Instruct 4 | MAX_ROUND=5 5 | for difficulty in easy medium hard 6 | do 7 | python evaluate_multiprocess.py \ 8 | --difficulty=${difficulty} \ 9 | --temperature=0.7 \ 10 | --num_threads=16 \ 11 | --n=8 \ 12 | --selection=oracle \ 13 | --lcb_version release_v4 \ 14 | --start_date 2024-08-01 \ 15 | --end_date 2024-12-01 \ 16 | --num_round ${MAX_ROUND} \ 17 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 18 | --api_base http://localhost:8000/v1 \ 19 | --selection oracle_all_rounds \ 20 | 
--result_json_path="results/sec5_revision_vanilla_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 21 | done 22 | -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/4o_mini_tool_assisted.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_tool_assisted\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --load_cached_preds \ 16 | --result_json_path="results/sec6_4o_mini_tool_assisted_${difficulty}_max_round_${MAX_ROUND}.json" \ 17 | --cached_preds_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/4o_mini_vanilla_baseline.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=first\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --load_cached_preds \ 16 | --result_json_path="results/sec6_4o_mini_vanilla_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 17 | --cached_preds_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done -------------------------------------------------------------------------------- 
/skythought/test-time-scaling/scripts/sec6/4o_mini_vanilla_with_4omini_generated_and_timeout_test.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --load_cached_preds \ 16 | --result_json_path="results/sec6_4o_mini_with_4omini_generated_and_timeout_test_${difficulty}_max_round_${MAX_ROUND}.json" \ 17 | --cached_preds_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_32b_tool_assisted.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_tool_assisted\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen32b_tool_assisted_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_32b_vanilla_baseline.sh: 
-------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=first\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen32b_vanilla_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_32b_with4omini_test_and_timeout_vanilla.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen32b_with_4omini_and_timeout_vanilla_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_7b_tool_assisted.sh: 
-------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_tool_assisted\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen7b_tool_assisted_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_7b_vanilla_baseline.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=first\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen7b_vanilla_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6/qwen_7b_with4omini_test_and_timeout_vanilla.sh: 
-------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen7b_with_4omini_and_timeout_vanilla_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6_llm_judge_baseline/4o_mini_llm_judge_baseline.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_aware_llm_judge\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --load_cached_preds \ 16 | --result_json_path="results/sec6_4o_mini_llm_judge_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 17 | --cached_preds_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6_llm_judge_baseline/qwen_32b_llm_judge_baseline.sh: -------------------------------------------------------------------------------- 1 | 
MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_aware_llm_judge\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-32B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen32b_llm_judge_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_32b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6_llm_judge_baseline/qwen_7b_llm_judge_baseline.sh: -------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests_aware_llm_judge\ 10 | --test_generator 4o-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --api_name openai/Qwen/Qwen2.5-Coder-7B-Instruct \ 16 | --api_base http://localhost:8000/v1 \ 17 | --result_json_path="results/sec6_qwen7b_llm_judge_baseline_${difficulty}_max_round_${MAX_ROUND}.json" \ 18 | --load_cached_preds \ 19 | --cached_preds_path="results/sec5_revision_vanilla_qwen_7b_${difficulty}_max_round_${MAX_ROUND}.json" 20 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/scripts/sec6_o1_generated/4o_mini_vanilla_with_o1_generated_and_timeout_test.sh: 
-------------------------------------------------------------------------------- 1 | MAX_ROUND=5 2 | for difficulty in easy medium hard 3 | do 4 | python evaluate_multiprocess.py \ 5 | --difficulty=${difficulty} \ 6 | --temperature=0.7 \ 7 | --num_threads=16 \ 8 | --n=8 \ 9 | --selection=generated_tests\ 10 | --test_generator o1-mini \ 11 | --lcb_version release_v4 \ 12 | --start_date 2024-08-01 \ 13 | --end_date 2024-12-01 \ 14 | --num_round ${MAX_ROUND} \ 15 | --load_cached_preds \ 16 | --result_json_path="results/sec6_4o_mini_with_o1mini_generated_and_timeout_test_${difficulty}_max_round_${MAX_ROUND}.json" \ 17 | --cached_preds_path="results/sec5_revision_vanilla_4o_mini_${difficulty}_max_round_${MAX_ROUND}.json" 18 | done -------------------------------------------------------------------------------- /skythought/test-time-scaling/util.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | 3 | def post_process_code(code): 4 | code = code.split("</code>")[0] 5 | code = code.replace("```python", "") 6 | code = code.split("```")[0] 7 | code = code.replace("<code>", "") 8 | # print(f"postprocessed code: {code}") 9 | return code 10 | 11 | name_map = { 12 | "4o-mini": 'openai/gpt-4o-mini', 13 | "4o": 'openai/gpt-4o', 14 | "o1-mini": 'openai/o1-mini', 15 | "o1": 'openai/o1-preview', 16 | "o3-mini": 'openai/o3-mini', 17 | "o1-preview": 'openai/o1-preview', 18 | "qwen7b": 'Qwen/Qwen2.5-Coder-7B-Instruct', 19 | "qwen32b": 'Qwen/Qwen2.5-Coder-32B-Instruct', 20 | } 21 | 22 | if os.path.exists("v4_only_medium_correct_codes.json"): 23 | ICL_EXAMPLES = json.load(open("v4_only_medium_correct_codes.json", "r")) 24 | else: 25 | print("No ICL examples available") 26 | ICL_EXAMPLES = {} -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .git 3 | .github 4 | .venv 5 |
cache 6 | data 7 | docker 8 | saves 9 | hf_cache 10 | ms_cache 11 | om_cache 12 | output 13 | .dockerignore 14 | .gitattributes 15 | .gitignore 16 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.env.local: -------------------------------------------------------------------------------- 1 | # Note: actually we do not support .env, just for reference 2 | # api 3 | API_HOST= 4 | API_PORT= 5 | API_KEY= 6 | API_MODEL_NAME= 7 | FASTAPI_ROOT_PATH= 8 | MAX_CONCURRENT= 9 | # general 10 | DISABLE_VERSION_CHECK= 11 | FORCE_CHECK_IMPORTS= 12 | LLAMAFACTORY_VERBOSITY= 13 | USE_MODELSCOPE_HUB= 14 | USE_OPENMIND_HUB= 15 | RECORD_VRAM= 16 | # torchrun 17 | FORCE_TORCHRUN= 18 | MASTER_ADDR= 19 | MASTER_PORT= 20 | NNODES= 21 | NODE_RANK= 22 | NPROC_PER_NODE= 23 | # wandb 24 | WANDB_DISABLED= 25 | WANDB_PROJECT= 26 | WANDB_API_KEY= 27 | # gradio ui 28 | GRADIO_SHARE= 29 | GRADIO_SERVER_NAME= 30 | GRADIO_SERVER_PORT= 31 | GRADIO_ROOT_PATH= 32 | GRADIO_IPV6= 33 | # setup 34 | ENABLE_SHORT_CONSOLE=1 35 | # reserved (do not use) 36 | LLAMABOARD_ENABLED= 37 | LLAMABOARD_WORKDIR= 38 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | Fixes # (issue) 4 | 5 | ## Before submitting 6 | 7 | - [ ] Did you read the [contributor guideline](https://github.com/hiyouga/LLaMA-Factory/blob/main/.github/CONTRIBUTING.md)? 8 | - [ ] Did you write any new necessary tests? 
9 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/hiyouga/LLaMA-Factory/security/advisories/new) tab. 4 | 5 | We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 6 | 7 | Report security bugs in third-party modules to the person or team maintaining the module. 8 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-ast 6 | - id: check-added-large-files 7 | args: ['--maxkb=25000'] 8 | - id: check-merge-conflict 9 | - id: check-yaml 10 | - id: debug-statements 11 | - id: end-of-file-fixer 12 | - id: trailing-whitespace 13 | args: [--markdown-linebreak-ext=md] 14 | - id: no-commit-to-branch 15 | args: ['--branch', 'main'] 16 | 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: v3.17.0 19 | hooks: 20 | - id: pyupgrade 21 | args: [--py38-plus] 22 | 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | rev: v0.6.9 25 | hooks: 26 | - id: ruff 27 | args: [--fix] 28 | - id: ruff-format 29 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE requirements.txt 2 | 
-------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build commit quality style test 2 | 3 | check_dirs := scripts src tests setup.py 4 | 5 | build: 6 | pip install build && python -m build 7 | 8 | commit: 9 | pre-commit install 10 | pre-commit run --all-files 11 | 12 | quality: 13 | ruff check $(check_dirs) 14 | ruff format --check $(check_dirs) 15 | 16 | style: 17 | ruff check $(check_dirs) --fix 18 | ruff format $(check_dirs) 19 | 20 | test: 21 | CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/ 22 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/assets/logo.png -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/assets/wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/assets/wechat.jpg -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/assets/wechat_npu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/assets/wechat_npu.jpg -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/1.jpg -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/1.mp4 -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/2.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/2.avi -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/2.jpg -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/3.jpg -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/data/mllm_demo_data/3.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/data/mllm_demo_data/3.mp4 -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/evaluation/ceval/ceval.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/evaluation/ceval/ceval.zip -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/evaluation/cmmlu/cmmlu.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/evaluation/cmmlu/cmmlu.zip -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/evaluation/mmlu/mmlu.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/evaluation/mmlu/mmlu.zip -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/accelerate/fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_forward_prefetch: false 9 | fsdp_cpu_ram_efficient_loading: true 10 | fsdp_offload_params: true # offload may affect training speed 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: FULL_STATE_DICT 13 | fsdp_sync_module_states: true 14 | 
fsdp_use_orig_params: true 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: fp16 # or bf16 18 | num_machines: 1 # the number of nodes 19 | num_processes: 2 # the number of GPUs in all nodes 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/deepspeed/ds_z0_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "gradient_clipping": "auto", 6 | "zero_allow_untested_optimizer": true, 7 | "fp16": { 8 | "enabled": "auto", 9 | "loss_scale": 0, 10 | "loss_scale_window": 1000, 11 | "initial_scale_power": 16, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 | }, 18 | "zero_optimization": { 19 | "stage": 0, 20 | "allgather_partitions": true, 21 | "allgather_bucket_size": 5e8, 22 | "overlap_comm": true, 23 | "reduce_scatter": true, 24 | "reduce_bucket_size": 5e8, 25 | "contiguous_gradients": true, 26 | "round_robin_gradients": true 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/deepspeed/ds_z2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_accumulation_steps": "auto", 5 | "gradient_clipping": "auto", 6 | "zero_allow_untested_optimizer": true, 7 | "fp16": { 8 | "enabled": "auto", 9 | "loss_scale": 0, 10 | "loss_scale_window": 1000, 11 | "initial_scale_power": 16, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 | }, 18 | "zero_optimization": { 
19 | "stage": 2, 20 | "allgather_partitions": true, 21 | "allgather_bucket_size": 5e8, 22 | "overlap_comm": true, 23 | "reduce_scatter": true, 24 | "reduce_bucket_size": 5e8, 25 | "contiguous_gradients": true, 26 | "round_robin_gradients": true 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/extras/adam_mini/qwen2_full_sft.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: Qwen/Qwen2-1.5B-Instruct 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: full 8 | use_adam_mini: true 9 | 10 | ### dataset 11 | dataset: identity,alpaca_en_demo 12 | template: qwen 13 | cutoff_len: 2048 14 | max_samples: 1000 15 | overwrite_cache: true 16 | preprocessing_num_workers: 16 17 | 18 | ### output 19 | output_dir: saves/qwen2-1_5b/full/sft 20 | logging_steps: 10 21 | save_steps: 500 22 | plot_loss: true 23 | overwrite_output_dir: true 24 | 25 | ### train 26 | per_device_train_batch_size: 1 27 | gradient_accumulation_steps: 8 28 | learning_rate: 1.0e-5 29 | num_train_epochs: 3.0 30 | lr_scheduler_type: cosine 31 | warmup_ratio: 0.1 32 | bf16: true 33 | ddp_timeout: 180000000 34 | 35 | ### eval 36 | val_size: 0.1 37 | per_device_eval_batch_size: 1 38 | eval_strategy: steps 39 | eval_steps: 500 40 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/extras/fsdp_qlora/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # DO NOT use GPTQ/AWQ model in FSDP+QLoRA 3 | 4 | CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ 5 | --config_file examples/accelerate/fsdp_config.yaml \ 6 | src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml 7 | -------------------------------------------------------------------------------- 
/skythought/train/LLaMA-Factory/examples/extras/llama_pro/expand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python scripts/llama_pro.py \ 4 | --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \ 5 | --output_dir models/llama3-8b-pro \ 6 | --num_expand 8 7 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/extras/nlg_eval/llama3_lora_predict.yaml: -------------------------------------------------------------------------------- 1 | # The batch generation can be SLOW using this config. 2 | # For faster inference, we recommend to use `scripts/vllm_infer.py`. 3 | 4 | ### model 5 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 6 | adapter_name_or_path: saves/llama3-8b/lora/sft 7 | 8 | ### method 9 | stage: sft 10 | do_predict: true 11 | finetuning_type: lora 12 | 13 | ### dataset 14 | eval_dataset: identity,alpaca_en_demo 15 | template: llama3 16 | cutoff_len: 2048 17 | max_samples: 50 18 | overwrite_cache: true 19 | preprocessing_num_workers: 16 20 | 21 | ### output 22 | output_dir: saves/llama3-8b/lora/predict 23 | overwrite_output_dir: true 24 | 25 | ### eval 26 | per_device_eval_batch_size: 1 27 | predict_with_generate: true 28 | ddp_timeout: 180000000 29 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/extras/pissa/init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python scripts/pissa_init.py \ 4 | --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \ 5 | --output_dir models/llama3-8b-pissa 6 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/inference/llama3.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: 
meta-llama/Meta-Llama-3-8B-Instruct 2 | template: llama3 3 | infer_backend: huggingface # choices: [huggingface, vllm] 4 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/inference/llama3_lora_sft.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 2 | adapter_name_or_path: saves/llama3-8b/lora/sft 3 | template: llama3 4 | finetuning_type: lora 5 | infer_backend: huggingface # choices: [huggingface, vllm] 6 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/inference/llama3_vllm.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 2 | template: llama3 3 | infer_backend: vllm 4 | vllm_enforce_eager: true 5 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/inference/llava1_5.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: llava-hf/llava-1.5-7b-hf 2 | template: llava 3 | infer_backend: huggingface # choices: [huggingface, vllm] 4 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/inference/qwen2_vl.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Qwen/Qwen2-VL-7B-Instruct 2 | template: qwen2_vl 3 | infer_backend: huggingface # choices: [huggingface, vllm] 4 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/merge_lora/llama3_gptq.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 3 | 
template: llama3 4 | 5 | ### export 6 | export_dir: models/llama3_gptq 7 | export_quantization_bit: 4 8 | export_quantization_dataset: data/c4_demo.json 9 | export_size: 2 10 | export_device: cpu 11 | export_legacy_format: false 12 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/merge_lora/llama3_lora_sft.yaml: -------------------------------------------------------------------------------- 1 | ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters 2 | 3 | ### model 4 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 5 | adapter_name_or_path: saves/llama3-8b/lora/sft 6 | template: llama3 7 | finetuning_type: lora 8 | 9 | ### export 10 | export_dir: models/llama3_lora_sft 11 | export_size: 2 12 | export_device: cpu 13 | export_legacy_format: false 14 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/merge_lora/qwen2vl_lora_sft.yaml: -------------------------------------------------------------------------------- 1 | ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters 2 | 3 | ### model 4 | model_name_or_path: Qwen/Qwen2-VL-7B-Instruct 5 | adapter_name_or_path: saves/qwen2_vl-7b/lora/sft 6 | template: qwen2_vl 7 | finetuning_type: lora 8 | 9 | ### export 10 | export_dir: models/qwen2_vl_lora_sft 11 | export_size: 2 12 | export_device: cpu 13 | export_legacy_format: false 14 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/train_lora/llama3_lora_eval.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 3 | adapter_name_or_path: saves/llama3-8b/lora/sft 4 | 5 | ### method 6 | finetuning_type: lora 7 | 8 | ### dataset 9 | task: mmlu_test # choices: [mmlu_test, 
ceval_validation, cmmlu_test] 10 | template: fewshot 11 | lang: en 12 | n_shot: 5 13 | 14 | ### output 15 | save_dir: saves/llama3-8b/lora/eval 16 | 17 | ### eval 18 | batch_size: 4 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/train_lora/llama3_lora_pretrain.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 3 | 4 | ### method 5 | stage: pt 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | 10 | ### dataset 11 | dataset: c4_demo 12 | cutoff_len: 2048 13 | max_samples: 1000 14 | overwrite_cache: true 15 | preprocessing_num_workers: 16 16 | 17 | ### output 18 | output_dir: saves/llama3-8b/lora/pretrain 19 | logging_steps: 10 20 | save_steps: 500 21 | plot_loss: true 22 | overwrite_output_dir: true 23 | 24 | ### train 25 | per_device_train_batch_size: 1 26 | gradient_accumulation_steps: 8 27 | learning_rate: 1.0e-4 28 | num_train_epochs: 3.0 29 | lr_scheduler_type: cosine 30 | warmup_ratio: 0.1 31 | bf16: true 32 | ddp_timeout: 180000000 33 | 34 | ### eval 35 | val_size: 0.1 36 | per_device_eval_batch_size: 1 37 | eval_strategy: steps 38 | eval_steps: 500 39 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/train_lora/llama3_lora_reward.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 3 | 4 | ### method 5 | stage: rm 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | 10 | ### dataset 11 | dataset: dpo_en_demo 12 | template: llama3 13 | cutoff_len: 2048 14 | max_samples: 1000 15 | overwrite_cache: true 16 | preprocessing_num_workers: 16 17 | 18 | ### output 19 | output_dir: saves/llama3-8b/lora/reward 20 | logging_steps: 10 21 | save_steps: 500 22 | plot_loss: 
true 23 | overwrite_output_dir: true 24 | 25 | ### train 26 | per_device_train_batch_size: 1 27 | gradient_accumulation_steps: 8 28 | learning_rate: 1.0e-4 29 | num_train_epochs: 3.0 30 | lr_scheduler_type: cosine 31 | warmup_ratio: 0.1 32 | bf16: true 33 | ddp_timeout: 180000000 34 | 35 | ### eval 36 | val_size: 0.1 37 | per_device_eval_batch_size: 1 38 | eval_strategy: steps 39 | eval_steps: 500 40 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/train_lora/llama3_preprocess.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | 10 | ### dataset 11 | dataset: identity,alpaca_en_demo 12 | template: llama3 13 | cutoff_len: 2048 14 | max_samples: 1000 15 | overwrite_cache: true 16 | preprocessing_num_workers: 16 17 | tokenized_path: saves/llama3-8b/dataset/sft 18 | 19 | ### output 20 | output_dir: saves/llama3-8b/lora/sft 21 | overwrite_output_dir: true 22 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/examples/train_lora/llava1_5_lora_sft.yaml: -------------------------------------------------------------------------------- 1 | ### model 2 | model_name_or_path: llava-hf/llava-1.5-7b-hf 3 | 4 | ### method 5 | stage: sft 6 | do_train: true 7 | finetuning_type: lora 8 | lora_target: all 9 | 10 | ### dataset 11 | dataset: mllm_demo 12 | template: llava 13 | cutoff_len: 2048 14 | max_samples: 1000 15 | overwrite_cache: true 16 | preprocessing_num_workers: 16 17 | 18 | ### output 19 | output_dir: saves/llava1_5-7b/lora/sft 20 | logging_steps: 10 21 | save_steps: 500 22 | plot_loss: true 23 | overwrite_output_dir: true 24 | 25 | ### train 26 | per_device_train_batch_size: 1 27 | gradient_accumulation_steps: 8 28 | 
learning_rate: 1.0e-4 29 | num_train_epochs: 3.0 30 | lr_scheduler_type: cosine 31 | warmup_ratio: 0.1 32 | bf16: true 33 | ddp_timeout: 180000000 34 | 35 | ### eval 36 | val_size: 0.1 37 | per_device_eval_batch_size: 1 38 | eval_strategy: steps 39 | eval_steps: 500 40 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | target-version = "py38" 7 | line-length = 119 8 | indent-width = 4 9 | 10 | [tool.ruff.lint] 11 | ignore = ["C408", "C901", "E501", "E731", "E741", "W605"] 12 | select = ["C", "E", "F", "I", "W"] 13 | 14 | [tool.ruff.lint.isort] 15 | lines-after-imports = 2 16 | known-first-party = ["llamafactory"] 17 | known-third-party = [ 18 | "accelerate", 19 | "datasets", 20 | "gradio", 21 | "numpy", 22 | "peft", 23 | "torch", 24 | "transformers", 25 | "trl" 26 | ] 27 | 28 | [tool.ruff.format] 29 | quote-style = "double" 30 | indent-style = "space" 31 | docstring-code-format = true 32 | skip-magic-trailing-comma = false 33 | line-ending = "auto" 34 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.41.2,<=4.46.1 2 | datasets>=2.16.0,<=3.1.0 3 | accelerate>=0.34.0,<=1.0.1 4 | peft>=0.11.1,<=0.12.0 5 | trl>=0.8.6,<=0.9.6 6 | tokenizers>=0.19.0,<0.20.4 7 | gradio>=4.0.0,<5.0.0 8 | pandas>=2.0.0 9 | scipy 10 | einops 11 | sentencepiece 12 | tiktoken 13 | protobuf 14 | uvicorn 15 | pydantic 16 | fastapi 17 | sse-starlette 18 | matplotlib>=3.7.0 19 | fire 20 | packaging 21 | pyyaml 22 | numpy<2.0.0 23 | av 24 | tyro<0.9.0 25 | -------------------------------------------------------------------------------- 
/skythought/train/LLaMA-Factory/src/llamafactory/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/api/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/chat/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .base_engine import BaseEngine 16 | from .chat_model import ChatModel 17 | 18 | 19 | __all__ = ["BaseEngine", "ChatModel"] 20 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/data/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/data/processors/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/eval/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/extras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/extras/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/launcher.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from llamafactory.train.tuner import run_exp # use absolute import 16 | 17 | 18 | def launch(): 19 | run_exp() 20 | 21 | 22 | if __name__ == "__main__": 23 | launch() 24 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/model/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/model/model_utils/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/train/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/dpo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .workflow import run_dpo 16 | 17 | 18 | __all__ = ["run_dpo"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/kto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .workflow import run_kto 16 | 17 | 18 | __all__ = ["run_kto"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .workflow import run_ppo 16 | 17 | 18 | __all__ = ["run_ppo"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/pt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .workflow import run_pt 16 | 17 | 18 | __all__ = ["run_pt"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/rm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .workflow import run_rm 16 | 17 | 18 | __all__ = ["run_rm"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/train/sft/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .workflow import run_sft 16 | 17 | 18 | __all__ = ["run_sft"] 19 | -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/llamafactory/webui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/skythought/train/LLaMA-Factory/src/llamafactory/webui/__init__.py -------------------------------------------------------------------------------- /skythought/train/LLaMA-Factory/src/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 the LlamaFactory team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from llamafactory.train.tuner import run_exp 16 | 17 | 18 | def main(): 19 | run_exp() 20 | 21 | 22 | def _mp_fn(index): 23 | # For xla_spawn (TPUs) 24 | run_exp() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /skythought/train/README.md: -------------------------------------------------------------------------------- 1 | ## Training 2 | We use a fork of [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) to perform training. 
3 | 4 | Step 1: Please add the data path produced by the tools directory, or the one we provide, to the `file_name` field of the Sky-T1 entry in [LLaMA-Factory/data/dataset_info.json](./LLaMA-Factory/data/dataset_info.json). 5 | 6 | Step 2: Run 7 | 8 | `FORCE_TORCHRUN=1 NNODES=1 NODE_RANK=0 MASTER_PORT=29501 llamafactory-cli train examples/train_full/qwen2_full_sft.yaml` 9 | 10 | to train from a 32B model on 8 H100 GPUs. Interested readers can refer to the detailed settings in [examples/train_full/qwen2_full_sft.yaml](./LLaMA-Factory/examples/train_full/qwen2_full_sft.yaml). 11 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/tests/__init__.py -------------------------------------------------------------------------------- /tests/evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/tests/evals/__init__.py -------------------------------------------------------------------------------- /tests/evals/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NovaSky-AI/SkyThought/cf3b941afcd439cde56f47789579822398a7789d/tests/evals/scoring/__init__.py --------------------------------------------------------------------------------