├── IFD ├── 1-pre_experience_analysis.sh ├── 2-train.sh ├── 3-analysis.sh ├── 4-final_selection.sh ├── cherry_seletion │ ├── data_analysis.py │ ├── data_analysis_pre.py │ ├── data_by_IFD.py │ ├── data_by_IFD_pre.py │ ├── data_by_IFD_vic.py │ ├── data_by_cluster.py │ ├── data_merge.py │ ├── filter.py │ └── select_from_ifd_json.py └── data_merge.py ├── LESS ├── 1-warmup_lora_train.sh ├── 2-grads_subsource_llama3.sh ├── 3-val_data_grads.sh ├── 4-matching.sh ├── 5-write.sh ├── less │ ├── .gitignore │ ├── analysis │ │ ├── llama-2-13b-hf_loss.pdf │ │ ├── llama-2-13b-hf_loss_acc.pdf │ │ ├── llama-2-7b-hf_loss.pdf │ │ ├── llama-2-7b-hf_loss_acc.pdf │ │ ├── loss.ipynb │ │ ├── mistral-7b_loss.pdf │ │ └── mistral-7b_loss_acc.pdf │ ├── data_selection │ │ ├── collect_grad_reps.py │ │ ├── get_info.py │ │ ├── get_test_dataset.py │ │ ├── get_training_dataset.py │ │ ├── get_validation_dataset.py │ │ ├── matching.py │ │ ├── matching_icl.py │ │ ├── matching_raw.py │ │ ├── selected_data.py │ │ ├── selected_data_icl.py │ │ └── write_selected_data.py │ ├── scripts │ │ ├── analysis │ │ │ └── analysis.sh │ │ ├── data_selection │ │ │ └── matching.sh │ │ ├── get_info │ │ │ ├── grad │ │ │ │ ├── get_eval_lora_grads.sh │ │ │ │ └── get_train_lora_grads.sh │ │ │ ├── loss │ │ │ │ ├── get_eval_lora_loss.sh │ │ │ │ ├── get_eval_pretrain_loss.sh │ │ │ │ └── get_loss.sh │ │ │ └── rep │ │ │ │ └── get_eval_lora_reps.sh │ │ └── train │ │ │ ├── base_training_args.sh │ │ │ ├── lora_train.sh │ │ │ └── warmup_lora_train.sh │ └── train │ │ ├── data_arguments.py │ │ ├── model_arguments.py │ │ ├── train.py │ │ └── training_arguments.py └── llama3.sh ├── README.md ├── SelectIT ├── data │ └── rating_prompt.txt ├── data_merge.py ├── self_reflection │ └── sentence_level.py ├── sentence_llama.sh └── sentence_qwen.sh ├── cross_entropy ├── entropy.sh └── entropy_response.py ├── diverse ├── configs │ ├── config_debug.yml │ ├── config_openhermes_llama3.yml │ ├── config_openhermes_qwen2.yml │ ├── config_wildchat_llama3.yml │ └── config_wildchat_qwen2.yml ├── consts.py ├── ds_z3_config.json ├── eval.py ├── evol_schedule_base.py ├── evol_schedules │ ├── __init__.py │ └── kcenter_sampling.py ├── llama3_openhermes.sh ├── llama3_wildchat.sh ├── qwen2_openhermes.sh ├── qwen2_wildchat.sh ├── templates │ ├── README.md │ ├── alpaca.json │ ├── alpaca_legacy.json │ ├── alpaca_short.json │ └── vigogne.json ├── train.py └── utils.py ├── llama-factory-train.sh ├── requirements.txt ├── token_length ├── compute_token_num.py ├── compute_token_qwen.sh ├── embedding.py └── kmeans_sample.py └── zip ├── ZIP.py └── zip.sh /IFD/1-pre_experience_analysis.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/1-pre_experience_analysis.sh -------------------------------------------------------------------------------- /IFD/2-train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/2-train.sh -------------------------------------------------------------------------------- /IFD/3-analysis.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/3-analysis.sh -------------------------------------------------------------------------------- /IFD/4-final_selection.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/4-final_selection.sh -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_analysis.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_analysis_pre.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_analysis_pre.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_by_IFD.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_by_IFD.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_by_IFD_pre.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_by_IFD_pre.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_by_IFD_vic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_by_IFD_vic.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_by_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_by_cluster.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/data_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/data_merge.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/filter.py -------------------------------------------------------------------------------- /IFD/cherry_seletion/select_from_ifd_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/cherry_seletion/select_from_ifd_json.py -------------------------------------------------------------------------------- /IFD/data_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/IFD/data_merge.py -------------------------------------------------------------------------------- /LESS/1-warmup_lora_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/1-warmup_lora_train.sh -------------------------------------------------------------------------------- /LESS/2-grads_subsource_llama3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/2-grads_subsource_llama3.sh -------------------------------------------------------------------------------- /LESS/3-val_data_grads.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/3-val_data_grads.sh -------------------------------------------------------------------------------- /LESS/4-matching.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/4-matching.sh -------------------------------------------------------------------------------- /LESS/5-write.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/5-write.sh -------------------------------------------------------------------------------- /LESS/less/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/.gitignore -------------------------------------------------------------------------------- /LESS/less/analysis/llama-2-13b-hf_loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/llama-2-13b-hf_loss.pdf -------------------------------------------------------------------------------- /LESS/less/analysis/llama-2-13b-hf_loss_acc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/llama-2-13b-hf_loss_acc.pdf -------------------------------------------------------------------------------- /LESS/less/analysis/llama-2-7b-hf_loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/llama-2-7b-hf_loss.pdf -------------------------------------------------------------------------------- /LESS/less/analysis/llama-2-7b-hf_loss_acc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/llama-2-7b-hf_loss_acc.pdf -------------------------------------------------------------------------------- /LESS/less/analysis/loss.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/loss.ipynb -------------------------------------------------------------------------------- /LESS/less/analysis/mistral-7b_loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/mistral-7b_loss.pdf -------------------------------------------------------------------------------- /LESS/less/analysis/mistral-7b_loss_acc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/analysis/mistral-7b_loss_acc.pdf -------------------------------------------------------------------------------- /LESS/less/data_selection/collect_grad_reps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/collect_grad_reps.py -------------------------------------------------------------------------------- /LESS/less/data_selection/get_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/get_info.py -------------------------------------------------------------------------------- /LESS/less/data_selection/get_test_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/get_test_dataset.py -------------------------------------------------------------------------------- /LESS/less/data_selection/get_training_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/get_training_dataset.py -------------------------------------------------------------------------------- /LESS/less/data_selection/get_validation_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/get_validation_dataset.py -------------------------------------------------------------------------------- /LESS/less/data_selection/matching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/matching.py -------------------------------------------------------------------------------- /LESS/less/data_selection/matching_icl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/matching_icl.py -------------------------------------------------------------------------------- /LESS/less/data_selection/matching_raw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/matching_raw.py -------------------------------------------------------------------------------- /LESS/less/data_selection/selected_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/selected_data.py -------------------------------------------------------------------------------- /LESS/less/data_selection/selected_data_icl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/selected_data_icl.py -------------------------------------------------------------------------------- /LESS/less/data_selection/write_selected_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/data_selection/write_selected_data.py -------------------------------------------------------------------------------- /LESS/less/scripts/analysis/analysis.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/analysis/analysis.sh -------------------------------------------------------------------------------- /LESS/less/scripts/data_selection/matching.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/data_selection/matching.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/grad/get_eval_lora_grads.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/get_info/grad/get_eval_lora_grads.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/grad/get_train_lora_grads.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/get_info/grad/get_train_lora_grads.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/loss/get_eval_lora_loss.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/get_info/loss/get_eval_lora_loss.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/loss/get_eval_pretrain_loss.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/get_info/loss/get_eval_pretrain_loss.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/loss/get_loss.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/get_info/loss/get_loss.sh -------------------------------------------------------------------------------- /LESS/less/scripts/get_info/rep/get_eval_lora_reps.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LESS/less/scripts/train/base_training_args.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/train/base_training_args.sh -------------------------------------------------------------------------------- /LESS/less/scripts/train/lora_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/train/lora_train.sh -------------------------------------------------------------------------------- /LESS/less/scripts/train/warmup_lora_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/scripts/train/warmup_lora_train.sh -------------------------------------------------------------------------------- /LESS/less/train/data_arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/train/data_arguments.py -------------------------------------------------------------------------------- /LESS/less/train/model_arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/train/model_arguments.py -------------------------------------------------------------------------------- /LESS/less/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/train/train.py -------------------------------------------------------------------------------- /LESS/less/train/training_arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/less/train/training_arguments.py -------------------------------------------------------------------------------- /LESS/llama3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/LESS/llama3.sh -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/README.md -------------------------------------------------------------------------------- /SelectIT/data/rating_prompt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/SelectIT/data/rating_prompt.txt -------------------------------------------------------------------------------- /SelectIT/data_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/SelectIT/data_merge.py -------------------------------------------------------------------------------- /SelectIT/self_reflection/sentence_level.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/SelectIT/self_reflection/sentence_level.py -------------------------------------------------------------------------------- /SelectIT/sentence_llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/SelectIT/sentence_llama.sh -------------------------------------------------------------------------------- /SelectIT/sentence_qwen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/SelectIT/sentence_qwen.sh -------------------------------------------------------------------------------- /cross_entropy/entropy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/cross_entropy/entropy.sh -------------------------------------------------------------------------------- /cross_entropy/entropy_response.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/cross_entropy/entropy_response.py -------------------------------------------------------------------------------- /diverse/configs/config_debug.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/configs/config_debug.yml -------------------------------------------------------------------------------- /diverse/configs/config_openhermes_llama3.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/configs/config_openhermes_llama3.yml -------------------------------------------------------------------------------- /diverse/configs/config_openhermes_qwen2.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/configs/config_openhermes_qwen2.yml -------------------------------------------------------------------------------- /diverse/configs/config_wildchat_llama3.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/configs/config_wildchat_llama3.yml -------------------------------------------------------------------------------- /diverse/configs/config_wildchat_qwen2.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/configs/config_wildchat_qwen2.yml -------------------------------------------------------------------------------- /diverse/consts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/consts.py -------------------------------------------------------------------------------- /diverse/ds_z3_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/ds_z3_config.json -------------------------------------------------------------------------------- /diverse/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/eval.py -------------------------------------------------------------------------------- /diverse/evol_schedule_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/evol_schedule_base.py -------------------------------------------------------------------------------- /diverse/evol_schedules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/evol_schedules/__init__.py -------------------------------------------------------------------------------- /diverse/evol_schedules/kcenter_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/evol_schedules/kcenter_sampling.py -------------------------------------------------------------------------------- /diverse/llama3_openhermes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/llama3_openhermes.sh -------------------------------------------------------------------------------- /diverse/llama3_wildchat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/llama3_wildchat.sh -------------------------------------------------------------------------------- /diverse/qwen2_openhermes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/qwen2_openhermes.sh -------------------------------------------------------------------------------- /diverse/qwen2_wildchat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/qwen2_wildchat.sh -------------------------------------------------------------------------------- /diverse/templates/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/templates/README.md -------------------------------------------------------------------------------- /diverse/templates/alpaca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/templates/alpaca.json -------------------------------------------------------------------------------- /diverse/templates/alpaca_legacy.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/templates/alpaca_legacy.json -------------------------------------------------------------------------------- /diverse/templates/alpaca_short.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/templates/alpaca_short.json -------------------------------------------------------------------------------- /diverse/templates/vigogne.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/templates/vigogne.json -------------------------------------------------------------------------------- /diverse/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/train.py -------------------------------------------------------------------------------- /diverse/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/diverse/utils.py -------------------------------------------------------------------------------- /llama-factory-train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/llama-factory-train.sh -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/requirements.txt -------------------------------------------------------------------------------- /token_length/compute_token_num.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/token_length/compute_token_num.py -------------------------------------------------------------------------------- /token_length/compute_token_qwen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/token_length/compute_token_qwen.sh -------------------------------------------------------------------------------- /token_length/embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/token_length/embedding.py -------------------------------------------------------------------------------- /token_length/kmeans_sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/token_length/kmeans_sample.py -------------------------------------------------------------------------------- /zip/ZIP.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/zip/ZIP.py -------------------------------------------------------------------------------- /zip/zip.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiatingyu/SFT-DataSelection-at-scale/HEAD/zip/zip.sh --------------------------------------------------------------------------------