├── .amltignore ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── data_ordering ├── config │ ├── folding.yaml │ ├── shuffle.yaml │ └── sorting.yaml ├── entry.py ├── entry.sh ├── folding.py ├── shuffle.py └── sorting.py ├── data_scoring ├── config │ ├── deepspeed_lqs.json │ ├── kenlm.yaml │ └── lqs.yaml ├── entry.sh ├── kenlm │ ├── README.md │ ├── entry.py │ ├── entry.sh │ └── model.py └── lqs │ ├── README.md │ ├── annotation │ ├── checkpointing.py │ ├── grad_utils.py │ ├── model_wrapper.py │ └── trainer.py │ ├── data_utils │ ├── __init__.py │ ├── base_datasets.py │ ├── data_scorer_datasets.py │ ├── distributed_indexed.py │ ├── indexed_dataset.py │ ├── lm_datasets.py │ └── prompt_datasets.py │ ├── entry.sh │ ├── infer_data_scorer.py │ ├── scorer │ ├── infer.py │ ├── modeling.py │ └── trainer.py │ ├── scripts │ ├── infer_data_scorer.sh │ ├── prepare_checkpoints.sh │ ├── prepare_full_dataset.sh │ ├── prepare_target_dataset.sh │ ├── proxy_data_annotation.sh │ ├── proxy_data_sampling.sh │ └── train_data_scorer.sh │ ├── tools │ ├── get_name_lqs.py │ ├── hf_download.py │ ├── prepare_data_scorer_train_data.py │ ├── process_data │ │ ├── end_sent_token_fairseq.json │ │ ├── end_sent_token_mistral.json │ │ ├── get_end_sent_tokens.py │ │ ├── hf_data_process.py │ │ ├── posttrain_data_process.py │ │ └── pretrain_data_process.py │ ├── sample_proxy_data.py │ └── token_data_bin2json.py │ ├── train_data_scorer.py │ └── train_eval_utils │ ├── __init__.py │ ├── base_evaluator.py │ ├── base_trainer.py │ ├── optimizers.py │ └── schedulers.py ├── data_selection ├── config │ ├── threshold.yaml │ ├── top-k.yaml │ └── top-r.yaml ├── entry.py ├── entry.sh ├── threshold.py ├── top_k.py └── top_r.py ├── figures ├── data_efficacy_paradigm.png ├── fig1_result.jpg ├── fig2_score.jpg └── fig3_order.jpg ├── model_eval ├── config │ ├── all.yaml │ ├── code.yaml │ ├── general.yaml │ └── math.yaml ├── entry.py ├── entry.sh └── lm_evaluation_harness.py ├── model_train ├── config │ ├── deepspeed.json │ └── train.yaml ├── entry.py ├── entry.sh └── trainer.py ├── requirements.txt └── utils.py /.amltignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/.amltignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/.gitignore -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/SECURITY.md -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/SUPPORT.md -------------------------------------------------------------------------------- /data_ordering/config/folding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/config/folding.yaml -------------------------------------------------------------------------------- /data_ordering/config/shuffle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/config/shuffle.yaml -------------------------------------------------------------------------------- /data_ordering/config/sorting.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/config/sorting.yaml -------------------------------------------------------------------------------- /data_ordering/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/entry.py -------------------------------------------------------------------------------- /data_ordering/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/entry.sh -------------------------------------------------------------------------------- /data_ordering/folding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/folding.py -------------------------------------------------------------------------------- /data_ordering/shuffle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/shuffle.py -------------------------------------------------------------------------------- /data_ordering/sorting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_ordering/sorting.py -------------------------------------------------------------------------------- /data_scoring/config/deepspeed_lqs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/config/deepspeed_lqs.json -------------------------------------------------------------------------------- /data_scoring/config/kenlm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/config/kenlm.yaml -------------------------------------------------------------------------------- /data_scoring/config/lqs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/config/lqs.yaml -------------------------------------------------------------------------------- /data_scoring/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/entry.sh -------------------------------------------------------------------------------- /data_scoring/kenlm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/kenlm/README.md -------------------------------------------------------------------------------- /data_scoring/kenlm/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/kenlm/entry.py -------------------------------------------------------------------------------- /data_scoring/kenlm/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/kenlm/entry.sh -------------------------------------------------------------------------------- /data_scoring/kenlm/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/kenlm/model.py -------------------------------------------------------------------------------- /data_scoring/lqs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/README.md -------------------------------------------------------------------------------- /data_scoring/lqs/annotation/checkpointing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/annotation/checkpointing.py -------------------------------------------------------------------------------- /data_scoring/lqs/annotation/grad_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/annotation/grad_utils.py -------------------------------------------------------------------------------- /data_scoring/lqs/annotation/model_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/annotation/model_wrapper.py -------------------------------------------------------------------------------- /data_scoring/lqs/annotation/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/annotation/trainer.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/__init__.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/base_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/base_datasets.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/data_scorer_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/data_scorer_datasets.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/distributed_indexed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/distributed_indexed.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/indexed_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/indexed_dataset.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/lm_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/lm_datasets.py -------------------------------------------------------------------------------- /data_scoring/lqs/data_utils/prompt_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/data_utils/prompt_datasets.py -------------------------------------------------------------------------------- /data_scoring/lqs/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/entry.sh -------------------------------------------------------------------------------- /data_scoring/lqs/infer_data_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/infer_data_scorer.py -------------------------------------------------------------------------------- /data_scoring/lqs/scorer/infer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scorer/infer.py -------------------------------------------------------------------------------- /data_scoring/lqs/scorer/modeling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scorer/modeling.py -------------------------------------------------------------------------------- /data_scoring/lqs/scorer/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scorer/trainer.py -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/infer_data_scorer.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/infer_data_scorer.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/prepare_checkpoints.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/prepare_checkpoints.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/prepare_full_dataset.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/prepare_full_dataset.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/prepare_target_dataset.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/prepare_target_dataset.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/proxy_data_annotation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/proxy_data_annotation.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/proxy_data_sampling.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/proxy_data_sampling.sh -------------------------------------------------------------------------------- /data_scoring/lqs/scripts/train_data_scorer.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/scripts/train_data_scorer.sh -------------------------------------------------------------------------------- /data_scoring/lqs/tools/get_name_lqs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/get_name_lqs.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/hf_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/hf_download.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/prepare_data_scorer_train_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/prepare_data_scorer_train_data.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/end_sent_token_fairseq.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/end_sent_token_fairseq.json -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/end_sent_token_mistral.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/end_sent_token_mistral.json -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/get_end_sent_tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/get_end_sent_tokens.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/hf_data_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/hf_data_process.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/posttrain_data_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/posttrain_data_process.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/process_data/pretrain_data_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/process_data/pretrain_data_process.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/sample_proxy_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/sample_proxy_data.py -------------------------------------------------------------------------------- /data_scoring/lqs/tools/token_data_bin2json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/tools/token_data_bin2json.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_data_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_data_scorer.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_eval_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_eval_utils/__init__.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_eval_utils/base_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_eval_utils/base_evaluator.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_eval_utils/base_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_eval_utils/base_trainer.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_eval_utils/optimizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_eval_utils/optimizers.py -------------------------------------------------------------------------------- /data_scoring/lqs/train_eval_utils/schedulers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_scoring/lqs/train_eval_utils/schedulers.py -------------------------------------------------------------------------------- /data_selection/config/threshold.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/config/threshold.yaml -------------------------------------------------------------------------------- /data_selection/config/top-k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/config/top-k.yaml -------------------------------------------------------------------------------- /data_selection/config/top-r.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/config/top-r.yaml -------------------------------------------------------------------------------- /data_selection/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/entry.py -------------------------------------------------------------------------------- /data_selection/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/entry.sh -------------------------------------------------------------------------------- /data_selection/threshold.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/threshold.py -------------------------------------------------------------------------------- /data_selection/top_k.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/top_k.py -------------------------------------------------------------------------------- /data_selection/top_r.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/data_selection/top_r.py -------------------------------------------------------------------------------- /figures/data_efficacy_paradigm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/figures/data_efficacy_paradigm.png -------------------------------------------------------------------------------- /figures/fig1_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/figures/fig1_result.jpg -------------------------------------------------------------------------------- /figures/fig2_score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/figures/fig2_score.jpg -------------------------------------------------------------------------------- /figures/fig3_order.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/figures/fig3_order.jpg -------------------------------------------------------------------------------- /model_eval/config/all.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/config/all.yaml -------------------------------------------------------------------------------- /model_eval/config/code.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/config/code.yaml -------------------------------------------------------------------------------- /model_eval/config/general.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/config/general.yaml -------------------------------------------------------------------------------- /model_eval/config/math.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/config/math.yaml -------------------------------------------------------------------------------- /model_eval/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/entry.py -------------------------------------------------------------------------------- /model_eval/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/entry.sh -------------------------------------------------------------------------------- /model_eval/lm_evaluation_harness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_eval/lm_evaluation_harness.py -------------------------------------------------------------------------------- /model_train/config/deepspeed.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_train/config/deepspeed.json -------------------------------------------------------------------------------- /model_train/config/train.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_train/config/train.yaml -------------------------------------------------------------------------------- /model_train/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_train/entry.py -------------------------------------------------------------------------------- /model_train/entry.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_train/entry.sh -------------------------------------------------------------------------------- /model_train/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/model_train/trainer.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/requirements.txt -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DELT/HEAD/utils.py --------------------------------------------------------------------------------