├── .gitignore ├── LICENSE ├── README.md ├── data ├── cptdata.py ├── dataset │ └── raw │ │ ├── QuALITY.v1.0.1.htmlstripped.dev │ │ └── QuALITY.v1.0.1.htmlstripped.train ├── entigraph.py ├── tokenize_entigraph.py ├── tokenize_instruct.py └── tokenize_redpj.py ├── evaluation.py ├── inference ├── devapi.py ├── llama.py ├── retrieval.py └── retry_wrapper.py ├── interactive.py ├── notebooks ├── nb_main_plot.ipynb └── nb_qa_eval.ipynb ├── requirements.txt ├── scripts ├── config │ └── fsdp_config.json ├── rag_sweep.sh └── train.sh ├── tasks ├── quality.py └── task_abc.py ├── train.py └── utils ├── io_utils.py ├── prompt_utils.py ├── prompts └── quality_retrieval_icl.txt └── python_utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/README.md -------------------------------------------------------------------------------- /data/cptdata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/cptdata.py -------------------------------------------------------------------------------- /data/dataset/raw/QuALITY.v1.0.1.htmlstripped.dev: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/dataset/raw/QuALITY.v1.0.1.htmlstripped.dev -------------------------------------------------------------------------------- /data/dataset/raw/QuALITY.v1.0.1.htmlstripped.train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/dataset/raw/QuALITY.v1.0.1.htmlstripped.train -------------------------------------------------------------------------------- /data/entigraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/entigraph.py -------------------------------------------------------------------------------- /data/tokenize_entigraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/tokenize_entigraph.py -------------------------------------------------------------------------------- /data/tokenize_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/tokenize_instruct.py -------------------------------------------------------------------------------- /data/tokenize_redpj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/data/tokenize_redpj.py -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/evaluation.py -------------------------------------------------------------------------------- /inference/devapi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/inference/devapi.py -------------------------------------------------------------------------------- /inference/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/inference/llama.py -------------------------------------------------------------------------------- /inference/retrieval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/inference/retrieval.py -------------------------------------------------------------------------------- /inference/retry_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/inference/retry_wrapper.py -------------------------------------------------------------------------------- /interactive.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/interactive.py -------------------------------------------------------------------------------- /notebooks/nb_main_plot.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/notebooks/nb_main_plot.ipynb -------------------------------------------------------------------------------- /notebooks/nb_qa_eval.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/notebooks/nb_qa_eval.ipynb -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/config/fsdp_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/scripts/config/fsdp_config.json -------------------------------------------------------------------------------- /scripts/rag_sweep.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/scripts/rag_sweep.sh -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/scripts/train.sh -------------------------------------------------------------------------------- /tasks/quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/tasks/quality.py -------------------------------------------------------------------------------- /tasks/task_abc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/tasks/task_abc.py -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/train.py -------------------------------------------------------------------------------- /utils/io_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/utils/io_utils.py -------------------------------------------------------------------------------- /utils/prompt_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/utils/prompt_utils.py -------------------------------------------------------------------------------- /utils/prompts/quality_retrieval_icl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/utils/prompts/quality_retrieval_icl.txt -------------------------------------------------------------------------------- /utils/python_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitongYang/Synthetic_Continued_Pretraining/HEAD/utils/python_utils.py --------------------------------------------------------------------------------