├── .gitignore ├── LICENSE.txt ├── README.md ├── assets ├── MATES.png └── avatar.png ├── requirements.txt ├── scripts ├── eval.sh ├── predict_data_influence.sh ├── pretrain.sh ├── probe_oracle_data_influence.sh └── train_data_influence_model.sh ├── src ├── lit_gpt │ ├── __init__.py │ ├── config.py │ ├── gen.py │ ├── model.py │ ├── packed_dataset.py │ ├── rmsnorm.py │ ├── tokenizer.py │ └── utils.py ├── pretrain │ ├── lm_eval_harness.py │ └── pretrain.py └── select_data │ ├── modeling_data_influence_model.py │ ├── predict_data_influence.py │ ├── prepare_lambada.py │ ├── probe_oracle_data_influence.py │ ├── select_data.py │ └── train_data_influence_model.py └── tokenizer └── togethercomputer └── RedPajama-INCITE-Base-7B-v0.1 ├── special_tokens_map.json ├── tokenizer.json └── tokenizer_config.json /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode/ 3 | results/ 4 | wandb/ 5 | data/ 6 | out/ 7 | .DS_Store -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/README.md -------------------------------------------------------------------------------- /assets/MATES.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/assets/MATES.png -------------------------------------------------------------------------------- /assets/avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/assets/avatar.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/scripts/eval.sh -------------------------------------------------------------------------------- /scripts/predict_data_influence.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/scripts/predict_data_influence.sh -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/scripts/pretrain.sh -------------------------------------------------------------------------------- /scripts/probe_oracle_data_influence.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/scripts/probe_oracle_data_influence.sh -------------------------------------------------------------------------------- /scripts/train_data_influence_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/scripts/train_data_influence_model.sh -------------------------------------------------------------------------------- /src/lit_gpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/__init__.py -------------------------------------------------------------------------------- /src/lit_gpt/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/config.py -------------------------------------------------------------------------------- /src/lit_gpt/gen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/gen.py -------------------------------------------------------------------------------- /src/lit_gpt/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/model.py -------------------------------------------------------------------------------- /src/lit_gpt/packed_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/packed_dataset.py -------------------------------------------------------------------------------- /src/lit_gpt/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/rmsnorm.py -------------------------------------------------------------------------------- /src/lit_gpt/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/tokenizer.py -------------------------------------------------------------------------------- /src/lit_gpt/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/lit_gpt/utils.py -------------------------------------------------------------------------------- /src/pretrain/lm_eval_harness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/pretrain/lm_eval_harness.py -------------------------------------------------------------------------------- /src/pretrain/pretrain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/pretrain/pretrain.py -------------------------------------------------------------------------------- /src/select_data/modeling_data_influence_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/modeling_data_influence_model.py -------------------------------------------------------------------------------- /src/select_data/predict_data_influence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/predict_data_influence.py -------------------------------------------------------------------------------- /src/select_data/prepare_lambada.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/prepare_lambada.py -------------------------------------------------------------------------------- /src/select_data/probe_oracle_data_influence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/probe_oracle_data_influence.py -------------------------------------------------------------------------------- /src/select_data/select_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/select_data.py -------------------------------------------------------------------------------- /src/select_data/train_data_influence_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/src/select_data/train_data_influence_model.py -------------------------------------------------------------------------------- /tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/special_tokens_map.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/special_tokens_map.json -------------------------------------------------------------------------------- /tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/tokenizer.json -------------------------------------------------------------------------------- /tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/tokenizer_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cxcscmu/MATES/HEAD/tokenizer/togethercomputer/RedPajama-INCITE-Base-7B-v0.1/tokenizer_config.json --------------------------------------------------------------------------------