└── codes_datasets ├── DataCleaning ├── utils │ ├── .gitkeep │ ├── __init__.py │ ├── law_lowwords.txt │ ├── word2phrase │ ├── spam_words_wudao.txt │ ├── __pycache__ │ │ ├── util.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── general_policy.cpython-37.pyc │ │ ├── general_policy.cpython-38.pyc │ │ ├── special_policy.cpython-38.pyc │ │ ├── check_black_words.cpython-38.pyc │ │ └── clean_headtails_from_content.cpython-38.pyc │ ├── ebook_lowwords.txt │ ├── ads_predict_fasttext.py │ ├── check_political_toxic.py │ ├── opencc_t2s.py │ ├── ray_utils.py │ ├── split_bigdata_file.py │ ├── random_sample.py │ ├── pretrain_data_sampling.py │ ├── safetycheck_random_sample.py │ ├── clean_headtails_from_content.py │ ├── test60.py │ ├── tokenizer.py │ ├── special_policy.py │ └── ocr_nlp.py ├── clean │ ├── __init__.py │ └── hjyroot │ │ └── sftDataGen │ │ └── clean-dataset │ │ ├── jd_cleaned_v1.jsonl │ │ └── .ipynb_checkpoints │ │ └── jd_cleaned_v1-checkpoint.jsonl ├── requirements.txt ├── stopall.sh ├── fasttext │ ├── README.md │ └── run_fastext.sh ├── run_tokenizer.sh ├── README.md ├── run_data_cleaning.sh └── preprocess │ ├── .ipynb_checkpoints │ ├── preprocess_cn-baidu_weixin-checkpoint.py │ ├── preprocess_cn-e-txt-checkpoint.py │ ├── search_pretraindata-checkpoint.py │ ├── preprocess_cn-kindle-checkpoint.py │ ├── preprocess_cn-39health-checkpoint.py │ ├── preprocess_cn-sina_iask-checkpoint.py │ ├── mnbvc_prepare-checkpoint.py │ └── preprocess_cn-mnbvc-checkpoint.py │ └── preprocess_cn-wechat.py ├── Postraining_dpo ├── tests │ ├── __init__.py │ ├── test_e2e.py │ ├── testing_constants.py │ ├── test_core.py │ ├── testing_utils.py │ ├── test_best_of_n_sampler.py │ ├── test_ddpo_trainer.py │ ├── test_data_collator_completion_only.py │ └── test_iterative_sft_trainer.py ├── examples │ ├── accelerate_configs │ │ ├── path │ │ │ └── hostfile4 │ │ ├── multi_gpu.yaml │ │ ├── deepspeed_zero1.yaml │ │ ├── deepspeed_zero2.yaml │ │ ├── deepspeed_zero3.yaml │ │ ├── zero3_multi_nodes.yaml │ │ └── zero2_multi_nodes.yaml │ ├── research_projects │ │ ├── stack_llama_2 │ │ │ └── scripts │ │ │ │ ├── requirements.txt │ │ │ │ └── README.md │ │ ├── toxicity │ │ │ └── README.md │ │ ├── README.md │ │ ├── stack_llama │ │ │ └── scripts │ │ │ │ ├── README.md │ │ │ │ └── merge_peft_adapter.py │ │ └── tools │ │ │ └── calculator.py │ ├── notebooks │ │ └── README.md │ └── hello_world.py ├── scripts │ ├── hostfile4 │ ├── killall.sh │ ├── stopall.sh │ ├── accelerate_configs │ │ ├── single_gpu.yaml │ │ ├── multi_gpu.yaml │ │ ├── deepspeed_zero1.yaml │ │ ├── deepspeed_zero2.yaml │ │ ├── deepspeed_zero3.yaml │ │ └── fsdp_qlora.yaml │ ├── dpo_infer.sh │ ├── dpo_pairwise_winrate.sh │ ├── profile │ ├── dpo_pairwise_score.sh │ └── postrain_with_dpo.sh ├── trl │ ├── __pycache__ │ │ ├── core.cpython-310.pyc │ │ ├── core.cpython-38.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-310.pyc │ │ ├── env_utils.cpython-310.pyc │ │ ├── import_utils.cpython-310.pyc │ │ └── import_utils.cpython-38.pyc │ ├── models │ │ ├── __pycache__ │ │ │ ├── utils.cpython-310.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── modeling_base.cpython-310.pyc │ │ │ ├── modeling_base.cpython-38.pyc │ │ │ ├── modeling_value_head.cpython-38.pyc │ │ │ └── modeling_value_head.cpython-310.pyc │ │ ├── __init__.py │ │ └── auxiliary_modules.py │ ├── trainer │ │ ├── __pycache__ │ │ │ ├── base.cpython-310.pyc │ │ │ ├── base.cpython-38.pyc │ │ │ ├── utils.cpython-38.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ 
├── judges.cpython-310.pyc │ │ │ ├── utils.cpython-310.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── callbacks.cpython-310.pyc │ │ │ ├── ddpo_config.cpython-310.pyc │ │ │ ├── ddpo_config.cpython-38.pyc │ │ │ ├── dpo_config.cpython-310.pyc │ │ │ ├── dpo_trainer.cpython-310.pyc │ │ │ ├── dpo_trainer.cpython-38.pyc │ │ │ ├── model_config.cpython-310.pyc │ │ │ ├── ppo_config.cpython-310.pyc │ │ │ ├── ppo_trainer.cpython-310.pyc │ │ │ ├── sft_trainer.cpython-310.pyc │ │ │ ├── reward_trainer.cpython-310.pyc │ │ │ ├── training_configs.cpython-310.pyc │ │ │ └── iterative_sft_trainer.cpython-310.pyc │ │ ├── online_dpo_config.py │ │ ├── rloo_config.py │ │ ├── ppov2_config.py │ │ ├── base.py │ │ ├── reward_config.py │ │ ├── training_configs.py │ │ ├── orpo_config.py │ │ ├── sft_config.py │ │ ├── model_config.py │ │ ├── alignprop_config.py │ │ └── cpo_config.py │ ├── extras │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── best_of_n_sampler.cpython-310.pyc │ │ │ └── best_of_n_sampler.cpython-38.pyc │ │ ├── __init__.py │ │ └── dataset_formatting.py │ ├── commands │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ └── cli_utils.cpython-310.pyc │ │ ├── __init__.py │ │ └── cli.py │ ├── environment │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── base_environment.cpython-38.pyc │ │ │ └── base_environment.cpython-310.pyc │ │ └── __init__.py │ └── env_utils.py ├── xllm │ ├── __pycache__ │ │ ├── util.cpython-310.pyc │ │ ├── data_decrypt.cpython-310.pyc │ │ └── request_http.cpython-310.pyc │ ├── dpo_win_state.py │ └── llama_flash_attn_monkey_patch.py ├── killall.sh ├── stopall.sh ├── README.md └── requirements │ ├── pip_dpo_requirements.txt │ └── conda_dpo_requirements.txt ├── PreferData └── README.md └── SFTData └── README.md /codes_datasets/DataCleaning/utils/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/PreferData/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Preference Data Recipe 4 | -------------------------------------------------------------------------------- /codes_datasets/SFTData/README.md: -------------------------------------------------------------------------------- 1 | 中英文指令数据集,需要根据自己的业务情况,进行清洗,指令数据的质量是影响效果的关键因素! 
2 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/hjyroot/sftDataGen/clean-dataset/jd_cleaned_v1.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/law_lowwords.txt: -------------------------------------------------------------------------------- 1 | 审判员 2 | 书记员 3 | 审判长 4 | 人民陪审员 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/path/hostfile4: -------------------------------------------------------------------------------- 1 | ip1 slots=8 2 | ip2 slots=8 3 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/hjyroot/sftDataGen/clean-dataset/.ipynb_checkpoints/jd_cleaned_v1-checkpoint.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/word2phrase: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/word2phrase -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/spam_words_wudao.txt: -------------------------------------------------------------------------------- 1 | 图片发自简书app 2 | 原文地址: 3 | 综合网络,如有侵权联系删除。 4 | 本文章已经通过区块链技术进行版权认证,禁止任何形式的改编转载抄袭,违者追究法律责任 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/hostfile4: -------------------------------------------------------------------------------- 1 | 15.108.121.45 slots=8 2 | 15.108.121.46 slots=8 3 | 15.108.121.47 slots=8 4 | 15.108.121.48 slots=8 5 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/util.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/util.cpython-310.pyc -------------------------------------------------------------------------------- 
/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama_2/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | trl 3 | peft 4 | accelerate 5 | datasets 6 | bitsandbytes 7 | wandb 8 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/env_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/env_utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-37.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/special_policy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/special_policy.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-310.pyc 
-------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/check_black_words.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/check_black_words.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/killall.sh: -------------------------------------------------------------------------------- 1 | source /etc/profile 2 | pdsh -w ssh:15.108.121.45,15.108.121.46,15.108.121.47,15.108.121.48 "bash /mnt/lptest/xubu/postrain/scripts/stopall.sh" 3 | 4 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/judges.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/judges.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/data_decrypt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/data_decrypt.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/request_http.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/request_http.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/commands/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/callbacks.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/callbacks.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__pycache__/cli_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/commands/__pycache__/cli_utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/model_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/model_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/sft_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/sft_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/reward_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/reward_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/training_configs.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/training_configs.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/clean_headtails_from_content.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/clean_headtails_from_content.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/iterative_sft_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/iterative_sft_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_e2e.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_hello_world(): 5 | subprocess.run( 6 | "python examples/hello_world.py", 7 | shell=True, 8 | check=True, 9 | ) 10 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/killall.sh: -------------------------------------------------------------------------------- 1 | source /etc/profile 2 | pdsh -w ssh:10.208.111.45,10.208.112.209,10.208.109.54,10.208.110.235 "bash /mnt/lptest/xubu/postrain/stopall.sh" 3 | pdsh -w ssh:10.208.111.45,10.208.112.209,10.208.109.54,10.208.110.235 "rm -rf /tmp/*" 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/stopall.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | ps aux|grep postrain_with_dpo.sh |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep postrain.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/stopall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux|grep postrain_with_dpo.sh |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep postrain.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/toxicity/README.md: -------------------------------------------------------------------------------- 1 | # De-detoxifying language models 2 | 3 | To run this code, do the following: 4 | 5 | ```shell 6 | ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb 7 | ``` -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | torch 3 | opencc 4 | flashtext 5 | tomark 6 | SentencePiece 7 | chardet 8 | apache-flink 9 | jieba 10 | pymysql 11 | dbutils 12 | xToolkit 13 | huggingface_hub 14 | openyxl 15 | jieba_fast 16 | TextBlob 17 | fpgrowth_py 18 | jsonlines 19 | openai 20 | pysubs2 21 | langid 22 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/stopall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux|grep wanjuan_clean.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep tokenizer.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | ps aux|grep random_sample.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 5 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/fasttext/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 🌱 Training fastext recipe 4 | ``` 5 | train_file = './data/clean_data.txt.train' 6 | python train.py \ 7 | --input_file $train_file \ 8 | --pretrain_w2v ./w2v/wiki-news-300d-1M.vec \ 9 | --model_file ./output_models/imagenet.bin \ 10 | --stop_word_path ./data/stop_words_en.txt \ 11 | --output_path ./data/imagenet_res.jsonl \ 12 | --mode train 13 | ``` 14 | 15 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: "NO" 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- 
/codes_datasets/Postraining_dpo/examples/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_infer.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | 4 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 5 | for model in ${models[*]} 6 | do 7 | echo "infer model ${model} on test dataset of ${dataset_name} ..." 8 | CUDA_VISIBLE_DEVICES='0,1' python dpo_generation_vllm.py \ 9 | --input_file ../dpo_dataGen/xllm_eval_500_final.jsonl \ 10 | --model_name $model \ 11 | --batch_size 4 12 | done 13 | 14 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ebook_lowwords.txt: -------------------------------------------------------------------------------- 1 | 声明: 2 | 声明 3 | 本书由 4 | 采集整理 5 | 仅供试读 6 | 版权归原作者所有 7 | 如有侵权 8 | 请联系本站 9 | 及时删除 10 | 提醒您: 11 | 提醒您 12 | 合理安排阅读时间 13 | 杜绝沉迷 14 | 网络小说 15 | 作者: 16 | 编简介 17 | 小说类别 18 | 总推荐 19 | 标签: 20 | 读者印象: 21 | 疑难解答 22 | 更新时间 23 | 本章字数 24 | 好书尽在疑难解答 25 | 网更新时间 26 | 本章字数 27 | 版权信息 28 | COPYRIGHT 29 | 书名 30 | 出版社 31 | 出版时间 32 | ISBN 33 | 授权得到 34 | APP电子版 35 | 制作与发行版权 36 | 所有侵权必究 37 | 更多精校小说 38 | 下载 39 | 更多小说下载 40 | 百度贴吧 41 | 收集整理 42 | 更多精校小说尽在 43 | 知轩藏书 44 | 全书完 45 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from typing import TYPE_CHECKING 3 | from ..import_utils import _LazyModule 4 | 5 | _import_structure = { 6 | "base_environment": ["TextEnvironment", "TextHistory"], 7 | } 8 | 9 | if TYPE_CHECKING: 10 | from .base_environment import TextEnvironment, TextHistory 11 | else: 12 | import sys 13 | 14 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 15 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/fasttext/run_fastext.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | train_file=./data/clean_data.txt.train 4 | 5 | python train.py \ 6 | --input_file $train_file \ 7 | --pretrain_w2v ./w2v/wiki-news-300d-1M.vec \ 8 | --model_file 
./output_models/imagenet.bin \ 9 | --stop_word_path ./data/stop_words_en.txt \ 10 | --output_path ./data/imagenet_res.jsonl \ 11 | --mode train 12 | if [ $? -ne 0 ]; then 13 | echo "train.py: ${train_file} failed." 14 | exit 15 | else 16 | echo "train.py: ${train_file} succeed." 17 | fi 18 | 19 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_pairwise_winrate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 4 | for model_compared in ${models[*]} 5 | do 6 | for eval_file in 'xllmtest' 7 | do 8 | k1=sft-checkpoint-72800 9 | k2=$model_compared 10 | 11 | python win_tie_loss_stat.py \ 12 | -i1 ${k1}-${k2}-${eval_file}.json \ 13 | -k1 $k1 \ 14 | -i2 ${k2}-${k1}-${eval_file}.json \ 15 | -k2 $k2 \ 16 | --output_dir ./ \ 17 | --dst $eval_file 18 | done 19 | done 20 | 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: 'bf16' 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero2.yaml: 
-------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: 'bf16' 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | # Research projects that use TRL 2 | 3 | Welcome to the research projects folder! Here you can find the scripts used for some research projects that used TRL and maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information! 4 | 5 | - [De-detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity) 6 | - [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama) 7 | - [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2) -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: true 9 | zero3_save_16bit_model: true 10 | zero_stage: 3 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | machine_rank: 0 14 | main_training_function: main 15 | mixed_precision: 'bf16' 16 | num_machines: 1 17 | num_processes: 8 18 | rdzv_backend: static 19 | same_network: true 20 | tpu_env: [] 21 | tpu_use_cluster: false 22 | tpu_use_sudo: false 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/run_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="Alpaca52k" 4 | clearning_version="v1" 5 | 6 | src_dir="/Users/mac/Downloads/sft/paper/sft_train4" 7 | dest_dir="/Users/mac/Downloads/sft/paper/sft_data" 8 | num_workers=1 9 | 10 | tokenizer_path="/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct" 11 | python utils/tokenizer.py \ 12 | --dataset_name ${dataset} \ 13 | --dataset_path ${src_dir} \ 14 | --output_path ${dest_dir} \ 15 | --tokenizer_path ${tokenizer_path} \ 16 | --version ${clearning_version} \ 17 | --num_workers ${num_workers} 18 | if [ $? -ne 0 ]; then 19 | echo "tokenizer.py failed." 20 | exit 21 | else 22 | echo "tokenizer.py succeed." 
23 | fi 24 | 25 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/profile: -------------------------------------------------------------------------------- 1 | # /etc/profile: system-wide .profile file for the Bourne shell (sh(1)) 2 | # and Bourne compatible shells (bash(1), ksh(1), ash(1), ...). 3 | 4 | if [ "${PS1-}" ]; then 5 | if [ "${BASH-}" ] && [ "$BASH" != "/bin/sh" ]; then 6 | # The file bash.bashrc already sets the default PS1. 7 | # PS1='\h:\w\$ ' 8 | if [ -f /etc/bash.bashrc ]; then 9 | . /etc/bash.bashrc 10 | fi 11 | else 12 | if [ "$(id -u)" -eq 0 ]; then 13 | PS1='# ' 14 | else 15 | PS1='$ ' 16 | fi 17 | fi 18 | fi 19 | 20 | if [ -d /etc/profile.d ]; then 21 | for i in /etc/profile.d/*.sh; do 22 | if [ -r $i ]; then 23 | . $i 24 | fi 25 | done 26 | unset i 27 | fi 28 | export PDSH_RCMD_TYPE=ssh 29 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/zero3_multi_nodes.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /path/hostfile 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 1 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: true 10 | zero3_save_16bit_model: true 11 | zero_stage: 3 12 | distributed_type: DEEPSPEED 13 | downcast_bf16: 'no' 14 | machine_rank: 0 15 | main_process_ip: 10.208.110.235 16 | main_process_port: 50528 17 | main_training_function: main 18 | mixed_precision: bf16 19 | num_machines: 4 20 | num_processes: 32 21 | rdzv_backend: static 22 | same_network: false 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /mnt/lptest/xubu/postrain/scripts/hostfile4 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 2 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: true 10 | zero3_save_16bit_model: true 11 | zero_stage: 3 12 | distributed_type: DEEPSPEED 13 | main_process_ip: 15.108.121.45 14 | main_process_port: 5158 15 | downcast_bf16: 'no' 16 | machine_rank: 0 17 | main_training_function: main 18 | mixed_precision: bf16 19 | num_machines: 4 20 | num_processes: 32 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/zero2_multi_nodes.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /data/usr/pangwei/frontllm/sft/dpo/examples/accelerate_configs/path/hostfile 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 1 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: false 10 | zero_stage: 2 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | 
machine_rank: 0 14 | main_process_ip: 172.16.19.45 15 | main_process_port: 2222 16 | main_training_function: main 17 | mixed_precision: bf16 18 | num_machines: 2 19 | num_processes: 16 20 | rdzv_backend: static 21 | same_network: false 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/online_dpo_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from typing import Literal 4 | 5 | from trl.trainer.utils import OnPolicyConfig 6 | 7 | 8 | @dataclass 9 | class OnlineDPOConfig(OnPolicyConfig): 10 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 11 | """the name of this experiment""" 12 | reward_model_path: str = "EleutherAI/pythia-160m" 13 | """the path to the reward model""" 14 | 15 | num_epochs: int = 4 16 | """the number of epochs to train""" 17 | 18 | beta: float = 0.05 19 | """the entropy regularization coefficient of DPO""" 20 | loss_type: Literal["sigmoid", "ipo"] = "sigmoid" 21 | """the type of loss to use for online DPO""" 22 | disable_dropout: bool = True 23 | """whether to disable dropout of the model during training""" 24 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications. 4 | 5 | - [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. 6 | - [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. 7 | - [`gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. 
8 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/rloo_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from ..trainer.utils import OnPolicyConfig 5 | 6 | 7 | @dataclass 8 | class RLOOConfig(OnPolicyConfig): 9 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 10 | """the name of this experiment""" 11 | reward_model_path: str = "EleutherAI/pythia-160m" 12 | """the path to the reward model""" 13 | 14 | # ppo config 15 | num_ppo_epochs: int = 4 16 | """the number of epochs to train""" 17 | whiten_rewards: bool = False 18 | """whether to whiten the rewards""" 19 | kl_coef: float = 0.05 20 | """the KL coefficient""" 21 | cliprange: float = 0.2 22 | """the clip range""" 23 | 24 | # rloo config 25 | rloo_k: int = 2 26 | """REINFORCE Leave-One-Out (RLOO) number of online samples per prompt""" 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__" 16 | CI_HUB_USER_FULL_NAME = "Dummy User" 17 | CI_HUB_USER_TOKEN = "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL" 18 | 19 | CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co" 20 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | Author: xubuvd 3 | Date: 13/08/2024 4 | Email: xubuvd@163.com 5 | ``` 6 | 7 | # 🌱 数据清洗方案 - Data Cleaning Recipe 8 | 它包含四个主要阶段:
9 | 1. **初始数据清洗**:对28个特定领域的数据集应用多种启发式过滤方法。
10 | 2. **文档级去重**:使用 MinHash 去除重复文档。
11 | 3. **统计分析**:使用 Llama-3.1-8B-Instruct 的分词器统计总 token 数。
12 | 4. **人工评估**:对100个数据点进行抽样和手动审查。
13 |
14 | It consists of four main stages:
15 | 1. **Initial Data Cleaning**: Apply various heuristic filtering methods to 28 domain-specific datasets.
16 | 2. **Document-Level Deduplication**: Use MinHash to remove near-duplicate documents (see the sketch after this list).
17 | 3. **Statistical Analysis**: Count the total number of tokens using the Llama-3.1-8B-Instruct tokenizer.
18 | 4. **Human Evaluation**: Conduct a manual review by sampling 100 data points.
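Stage 2 (document-level deduplication) is the most algorithmic step, so a minimal sketch is given below. It only illustrates the MinHash idea, assuming the third-party `datasketch` library and character 3-gram shingles; the actual scripts in this repo may implement it differently.

```python
# Minimal MinHash near-duplicate removal sketch (assumes `pip install datasketch`).
from datasketch import MinHash, MinHashLSH

def minhash_of(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    # Character 3-gram shingles work for mixed Chinese/English text.
    for i in range(max(len(text) - 2, 1)):
        m.update(text[i:i + 3].encode("utf-8"))
    return m

def dedup(docs, threshold=0.8, num_perm=128):
    """Keep only the first document of each near-duplicate cluster."""
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    kept = []
    for idx, doc in enumerate(docs):
        m = minhash_of(doc, num_perm)
        if lsh.query(m):  # a near-duplicate document was already kept
            continue
        lsh.insert(str(idx), m)
        kept.append(doc)
    return kept

if __name__ == "__main__":
    print(dedup(["今天天气很好。", "今天天气很好!", "完全不同的一篇文档。"]))
```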
19 | 20 | # 🍂 启动和暂停 - running and killing 21 | ``` 22 | nohup bash run_data_cleaning.sh > r.log 2>&1 & 23 | bash stopall.sh 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/README.md: -------------------------------------------------------------------------------- 1 | 2 | # See pip. conda envs at ./requirements/ 3 | ``` 4 | trl, v0.9.6 5 | conda_dpo_requirements.txt 6 | pip_dpo_requirements.txt 7 | ``` 8 | # step1: Post-Training with Direct Preference Optimization 9 | ``` 10 | bash scripts/postrain_with_dpo.sh 11 | ``` 12 | 13 | ## Input data formation 14 | traing data stored in jsonl style, one line is as follows: 15 | ``` 16 | {"id":"1","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 17 | {"id":"2","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 18 | {"id":"3","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 19 | ... 20 | ``` 21 | 22 | # step2: Make the trained model infering with vllm 23 | ``` 24 | bash dpo_infer.sh 25 | ``` 26 | 27 | # step3: Compare performance of two models of DPO vs. SFT with gpt4-0613 28 | ``` 29 | bash dpo_pairwise_score.sh 30 | ``` 31 | 32 | # step4: Calculate win-rate for two compared models 33 | ``` 34 | bash dpo_pairwise_winrate.sh 35 | ``` 36 | 37 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/ppov2_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from ..trainer.utils import OnPolicyConfig 5 | 6 | 7 | @dataclass 8 | class PPOv2Config(OnPolicyConfig): 9 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 10 | """the name of this experiment""" 11 | reward_model_path: str = "EleutherAI/pythia-160m" 12 | """the path to the reward model""" 13 | 14 | # ppo config 15 | num_ppo_epochs: int = 4 16 | """the number of epochs to train""" 17 | whiten_rewards: bool = False 18 | """whether to whiten the rewards""" 19 | kl_coef: float = 0.05 20 | """the KL coefficient""" 21 | cliprange: float = 0.2 22 | """the clip range""" 23 | vf_coef: float = 0.1 24 | """the value function coefficient""" 25 | cliprange_value: float = 0.2 26 | """the clip range for the value function""" 27 | gamma: float = 1 28 | """the discount factor""" 29 | lam: float = 0.95 30 | """the lambda value for GAE""" 31 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ads_predict_fasttext.py: -------------------------------------------------------------------------------- 1 | # _*_coding:utf-8 _*_ 2 | import json 3 | import logging 4 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 5 | import fasttext 6 | from tqdm import tqdm 7 | 8 | #加载模型 9 | model = fasttext.load_model('./fastText_shortAd/models/fasttext_train.model.bin') 10 | 11 | labels_right = [] 12 | texts = [] 13 | labels_predict = [] 14 | 15 | with open("/data/data_warehouse/llm/source_data/cn-wechat/wx_data_980.jsonl") as fr: 16 | datas = fr.readlines() 17 | for idx,line in tqdm(enumerate(datas),total=len(datas)): 18 | line = line.strip() 19 | if len(line) < 5: continue 20 | js_dict = json.loads(line) 21 | text = js_dict["content"].strip() 22 | text = text.replace("\n"," ") 23 | label_predict = model.predict(text) 24 | labels_predict.append(label_predict[0]) 25 | print ("文本: ",text[0:200]) 26 | print ("预测label: ",label_predict[0]) 
27 | print("-"*60) 28 | 29 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/fsdp_qlora.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: true 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: 'bf16' 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from typing import TYPE_CHECKING 17 | 18 | from ..import_utils import _LazyModule 19 | 20 | 21 | _import_structure = { 22 | "best_of_n_sampler": ["BestOfNSampler"], 23 | } 24 | 25 | if TYPE_CHECKING: 26 | from .best_of_n_sampler import BestOfNSampler 27 | else: 28 | import sys 29 | 30 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 31 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_pairwise_score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # gpt-3.5-turbo-0613 4 | # gpt-4-0613 5 | 6 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 7 | for model in ${models[*]} 8 | do 9 | for eval_file in 'frontis' 10 | do 11 | k1=sft-70b_final_95w_3epoch 12 | k2=$model 13 | scorer=gpt-4-0613 14 | 15 | echo "pairwise compare between ${model} and ${k1} on ${eval_file} ..." 
16 | python xllm/dpo_pairwise_score_by_gpt4.py \ 17 | -i1 ./evaluation/results/${k1}/${eval_file}/seed_3517.json \ 18 | -i2 ./evaluation/results/${k2}/${eval_file}/seed_3517.json \ 19 | -k1 $k1 \ 20 | -k2 $k2 \ 21 | --batch_size 10 \ 22 | --max_tokens 32 \ 23 | --output_dir ./ \ 24 | --eval_scorer $scorer 25 | 26 | python xllm/dpo_pairwise_score_by_gpt4.py \ 27 | -i1 ./evaluation/results/${k2}/${eval_file}/seed_3517.json \ 28 | -i2 ./evaluation/results/${k1}/${eval_file}/seed_3517.json \ 29 | -k1 $k2 \ 30 | -k2 $k1 \ 31 | --batch_size 10 \ 32 | --max_tokens 32 \ 33 | --output_dir ./ \ 34 | --eval_scorer $scorer 35 | done 36 | done 37 | 38 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2024 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # flake8: noqa 17 | 18 | from typing import TYPE_CHECKING 19 | from ..import_utils import _LazyModule, OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = { 23 | "cli_utils": ["SFTScriptArguments", "init_zero_verbose", "DPOScriptArguments", "TrlParser", "YamlConfigParser"], 24 | } 25 | 26 | 27 | if TYPE_CHECKING: 28 | from .cli_utils import SFTScriptArguments, init_zero_verbose, DPOScriptArguments, TrlParser, YamlConfigParser 29 | else: 30 | import sys 31 | 32 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 33 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/check_political_toxic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(r"..") 3 | from utils.util import load_set_from_txt 4 | _PolityToxic_ = "utils/political_toxic.txt" 5 | 6 | 7 | from flashtext import KeywordProcessor 8 | 9 | class CheckToxicWords(): 10 | def __init__(self): 11 | ''' 12 | self.political_words_set = load_set_from_txt(_PolityPersons_) 13 | self.sex_words = load_set_from_txt(_Sex_words_) 14 | self.ad_words = load_set_from_txt(_Ad_words_) 15 | ''' 16 | 17 | self.processor = KeywordProcessor() 18 | self.processor.add_keyword_from_file(_PolityToxic_) 19 | 20 | ''' 21 | aa = keyword_processor.extract_keywords('周杰伦是歌星在吉林大路开演唱会,导演国内有冯小刚,苏有朋演的是五阿哥,他现在居住在北京') 22 | print(aa) 23 | 运行结果: 24 | ['明星', '路名', '明星', '明星', '地名'] 25 | ''' 26 | 27 | def is_toxic_text(self,text,thresh_hold=1): 28 | res = self.processor.extract_keywords(text) 29 | # ["politician","badword","gumble","sex","ads","dirty"] 30 | if len(res) >= thresh_hold: return True,"_".join(res) 31 | return False,"" 32 | 33 | def checking_political_words(self,text): 34 | pass 35 | 36 | def checking_sex_words(self,text): 37 | pass 38 | 39 | def checking_ad_words(self,text): 40 | pass 41 | 42 | 43 | 44 | 
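The keyword screening in `check_political_toxic.py` above is driven entirely by the phrase list loaded into flashtext, so using it in a cleaning pass reduces to one call per document. A minimal usage sketch follows; it assumes the script is launched from the `DataCleaning` directory (so the relative `utils/political_toxic.txt` path resolves), and the sample strings are placeholders.

```python
# Hypothetical driver for CheckToxicWords; the input texts below are made-up placeholders.
from utils.check_political_toxic import CheckToxicWords

checker = CheckToxicWords()

docs = [
    "an ordinary paragraph about cooking",
    "a paragraph that happens to contain a blacklisted phrase",
]
for text in docs:
    flagged, hits = checker.is_toxic_text(text, thresh_hold=1)
    if flagged:
        # `hits` joins every matched keyword with "_", e.g. "politician_ads"
        print("drop:", text[:40], "| matched:", hits)
    else:
        print("keep:", text[:40])
```

Raising `thresh_hold` keeps documents that only graze the keyword list once, at the cost of letting more borderline text through.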
-------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/env_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Function `strtobool` copied and adapted from `distutils` (as deprected 16 | # in Python 3.10). 17 | # Reference: https://github.com/python/cpython/blob/48f9d3e3faec5faaa4f7c9849fecd27eae4da213/Lib/distutils/util.py#L308-L321 18 | 19 | 20 | def strtobool(val: str) -> bool: 21 | """Convert a string representation of truth to True or False booleans. 22 | 23 | True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values 24 | are 'n', 'no', 'f', 'false', 'off', and '0'. 25 | 26 | Raises: 27 | ValueError: if 'val' is anything else. 28 | """ 29 | val = val.lower() 30 | if val in ("y", "yes", "t", "true", "on", "1"): 31 | return True 32 | if val in ("n", "no", "f", "false", "off", "0"): 33 | return False 34 | raise ValueError(f"Invalid truth value, it should be a string but {val} was provided instead.") 35 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/hello_world.py: -------------------------------------------------------------------------------- 1 | # 0. imports 2 | import torch 3 | from transformers import GPT2Tokenizer 4 | 5 | from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer 6 | 7 | 8 | # 1. load a pretrained model 9 | model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2") 10 | model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2") 11 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 12 | tokenizer.pad_token = tokenizer.eos_token 13 | 14 | # 2. initialize trainer 15 | ppo_config = {"batch_size": 1} 16 | config = PPOConfig(**ppo_config) 17 | ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer) 18 | 19 | # 3. encode a query 20 | query_txt = "This morning I went to the " 21 | query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device) 22 | 23 | # 4. generate model response 24 | generation_kwargs = { 25 | "min_length": -1, 26 | "top_k": 0.0, 27 | "top_p": 1.0, 28 | "do_sample": True, 29 | "pad_token_id": tokenizer.eos_token_id, 30 | "max_new_tokens": 20, 31 | } 32 | response_tensor = ppo_trainer.generate([item for item in query_tensor], return_prompt=False, **generation_kwargs) 33 | response_txt = tokenizer.decode(response_tensor[0]) 34 | 35 | # 5. define a reward for response 36 | # (this could be any reward such as human feedback or output from another model) 37 | reward = [torch.tensor(1.0, device=model.pretrained_model.device)] 38 | 39 | # 6. 
train model with ppo 40 | train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward) 41 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/dpo_win_state.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import time 5 | from tqdm import tqdm 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | "--input_file", 11 | type=str, 12 | default="./dpo_ckpt_70b_32k_v6-sft-Llama3-70b_final_4epoch-frontis.json", 13 | help="" 14 | ) 15 | parser.add_argument( 16 | "--win_var", 17 | type=int, 18 | default=4, 19 | help="" 20 | ) 21 | args = parser.parse_args() 22 | 23 | js_dict_list = json.load(open(args.input_file)) 24 | total_win_var_num = 0 25 | total_loss_var_num = 0 26 | for idx,js_dict in tqdm(enumerate(js_dict_list),total=len(js_dict_list)): 27 | score_list = js_dict['score'] 28 | dpo_score = int(score_list[0]) 29 | sft_core = int(score_list[1]) 30 | 31 | if dpo_score >= 10 - args.win_var and sft_core <= args.win_var: 32 | total_win_var_num += 1 33 | if sft_core >= 10 - args.win_var and dpo_score <= args.win_var: 34 | total_loss_var_num += 1 35 | print("total numbers of DPO eval dataset: ", len(js_dict_list)) 36 | print("total significantly improved instances that below {} score: {}".format(args.win_var,total_win_var_num)) 37 | print("total significantly declined instances that below {} score: {}".format(args.win_var,total_loss_var_num)) 38 | print("significantly improved ratio: {}%".format(100.0*total_win_var_num/len(js_dict_list))) 39 | print("significantly declined ratio: {}%".format(100.0*total_loss_var_num/len(js_dict_list))) 40 | 41 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import unittest 15 | 16 | import torch 17 | 18 | from trl.core import masked_mean, masked_var, masked_whiten, whiten 19 | 20 | 21 | class CoreTester(unittest.TestCase): 22 | """ 23 | A wrapper class for testing core utils functions 24 | """ 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.test_input = torch.Tensor([1, 2, 3, 4]) 29 | cls.test_mask = torch.Tensor([0, 1, 1, 0]) 30 | cls.test_input_unmasked = cls.test_input[1:3] 31 | 32 | def test_masked_mean(self): 33 | self.assertEqual(torch.mean(self.test_input_unmasked), masked_mean(self.test_input, self.test_mask)) 34 | 35 | def test_masked_var(self): 36 | self.assertEqual(torch.var(self.test_input_unmasked), masked_var(self.test_input, self.test_mask)) 37 | 38 | def test_masked_whiten(self): 39 | whiten_unmasked = whiten(self.test_input_unmasked) 40 | whiten_masked = masked_whiten(self.test_input, self.test_mask)[1:3] 41 | diffs = (whiten_unmasked - whiten_masked).sum() 42 | self.assertAlmostEqual(diffs, 0) 43 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama_2/scripts/README.md: -------------------------------------------------------------------------------- 1 | # DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model 2 | 3 | ## Prerequisites 4 | 5 | Install all the dependencies in the `requirements.txt`: 6 | 7 | ``` 8 | $ pip install -U -r requirements.txt 9 | ``` 10 | 11 | Since we will use `accelerate` for training, make sure to run: 12 | ``` 13 | $ accelerate config 14 | ``` 15 | 16 | ## Training 17 | 18 | There were two main steps to the DPO training process: 19 | 1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: 20 | - `accelerate launch examples/stack_llama_2/scripts/sft_llama2.py --training_args.output_dir="sft"` 21 | 1. Run the DPO trainer using the model saved by the previous step: 22 | - `accelerate launch examples/stack_llama_2/scripts/dpo_llama2.py --model_name_or_path="sft/final_checkpoint" --output_dir="dpo"` 23 | 24 | 25 | ## Merging the adaptors 26 | 27 | To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: 28 | 29 | ``` 30 | python trl/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2" 31 | ``` 32 | 33 | which will also push the model to your HuggingFace hub account. 34 | 35 | ## Running the model 36 | 37 | We can load the DPO-trained LoRA adaptors which were saved by the DPO training step and load them via: 38 | 39 | ```py 40 | from peft import AutoPeftModelForCausalLM 41 | 42 | 43 | model = AutoPeftModelForCausalLM.from_pretrained( 44 | "dpo/final_checkpoint", 45 | low_cpu_mem_usage=True, 46 | torch_dtype=torch.float16, 47 | load_in_4bit=True, 48 | ) 49 | 50 | model.generate(...) 
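# note: the snippet above also needs `import torch`, since torch.float16 is passed as torch_dtype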
51 | ``` 52 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/run_data_cleaning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="cn-long_context-rewrite" 4 | clearning_version="v9" 5 | 6 | source_dir="/localdisk/llm/source_data/${dataset}" 7 | dest_dir="/localdisk/llm/clean_data/${dataset}/${clearning_version}" 8 | num_workers=32 9 | 10 | # Step1: Perform dataset cleaning 11 | python clean/wechat_clean.py \ 12 | --num_workers ${num_workers} \ 13 | --dataset_name ${dataset} \ 14 | --source_path ${source_dir} \ 15 | --dest_path ${dest_dir} 16 | if [ $? -ne 0 ]; then 17 | echo "${dataset}_clean.py failed." 18 | exit 19 | else 20 | echo "${dataset}_clean.py succeed." 21 | fi 22 | < --streaming --no_gradient_checkpointing --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se` 5 | 2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm: 6 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/stack_llama/scripts/reward_modeling.py --model_name=` 7 | 3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: 8 | - `accelerate launch --multi_gpu --num_machines 1 --num_processes 8 examples/stack_llama/scripts/rl_training.py --log_with=wandb --model_name= --reward_model_name= --adafactor=False --tokenizer_name= --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` 9 | 10 | 11 | LoRA layers were using at all stages to reduce memory requirements. 12 | At each stage the peft adapter layers were merged with the base model, using: 13 | ```shell 14 | python examples/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ 15 | ``` 16 | Note that this script requires `peft>=0.3.0`. 17 | 18 | For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). 19 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from huggingface_hub import PyTorchModelHubMixin 16 | 17 | 18 | class BaseTrainer(PyTorchModelHubMixin): 19 | r""" 20 | Base class for all trainers - this base class implements the basic functions that we 21 | need for a trainer. 
22 | 23 | The trainer needs to have the following functions: 24 | - step: takes in a batch of data and performs a step of training 25 | - loss: takes in a batch of data and returns the loss 26 | - compute_rewards: takes in a batch of data and returns the rewards 27 | - _build_models_and_tokenizer: builds the models and tokenizer 28 | - _build_dataset: builds the dataset 29 | Each user is expected to implement their own trainer class that inherits from this base 30 | if they want to use a new training algorithm. 31 | """ 32 | 33 | def __init__(self, config): 34 | self.config = config 35 | 36 | def step(self, *args): 37 | raise NotImplementedError("Not implemented") 38 | 39 | def loss(self, *args): 40 | raise NotImplementedError("Not implemented") 41 | 42 | def compute_rewards(self, *args): 43 | raise NotImplementedError("Not implemented") 44 | 45 | def _save_pretrained(self, save_directory): 46 | raise NotImplementedError("Not implemented") 47 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/opencc_t2s.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | from tqdm import tqdm 5 | import opencc 6 | import argparse 7 | from tqdm import tqdm 8 | from os import listdir, path 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--source_path', 13 | type=str, 14 | default="/data/data_warehouse/llm/llm-data-org.del/cn-wiki2", 15 | help='Directory containing trained actor model') 16 | parser.add_argument('--dest_path', 17 | type=str, 18 | default="/data/data_warehouse/llm/llm-data-org.del/", 19 | help='Directory containing trained actor model') 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def split_cn_wiki(args): 26 | files = sorted(listdir(args.source_path)) 27 | 28 | WikiDir = os.path.join(args.dest_path, "cn-wiki2_t2s") 29 | if not os.path.exists(WikiDir): 30 | os.makedirs(WikiDir, exist_ok=True) 31 | 32 | converter = opencc.OpenCC('t2s.json') 33 | 34 | for input_file in tqdm(files,total=len(files)): 35 | 36 | ifile = os.path.join(args.source_path,input_file) 37 | 38 | wiki_output_file = os.path.join(WikiDir,input_file) 39 | if os.path.exists(wiki_output_file): os.remove(wiki_output_file) 40 | wiki_fo = open(wiki_output_file, 'a+', encoding='utf-8') 41 | 42 | for line in open(ifile,'r',encoding="utf-8"): 43 | line = line.strip() 44 | if len(line) < 5:continue 45 | js_dict = json.loads(line) 46 | content = converter.convert(js_dict["content"]) 47 | js_dict["content"] = content 48 | jstr = json.dumps(js_dict, ensure_ascii=False) 49 | wiki_fo.write(jstr+"\n") 50 | wiki_fo.close() 51 | 52 | if __name__ == '__main__': 53 | 54 | args = parse_args() 55 | split_cn_wiki(args) 56 | 57 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ray_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ray 3 | import argparse 4 | 5 | def change_extension(filename, new_extension): 6 | base_name = os.path.splitext(filename)[0] # 获取文件名(不包含扩展名) 7 | new_filename = f"{base_name}.{new_extension}" # 构建新的文件名 8 | return new_filename 9 | 10 | def process_files_extension(folder_path, raw_extension, new_extension): 11 | for filename in os.listdir(folder_path): 12 | if filename.endswith(f".{raw_extension}"): 13 | old_path = os.path.join(folder_path, filename) 14 | new_filename = 
change_extension(filename, f"{new_extension}") # 修改后缀为"modified" 15 | new_path = os.path.join(folder_path, new_filename) 16 | os.rename(old_path, new_path) 17 | print(f"重命名文件:{filename} -> {new_filename}") 18 | 19 | def test_folder(dest_path): 20 | if not os.path.exists(dest_path): 21 | os.makedirs(dest_path, exist_ok=True) 22 | GoodDir = os.path.join(dest_path, "good") 23 | BadDir = os.path.join(dest_path, "bad") 24 | 25 | if not os.path.exists(GoodDir): 26 | os.makedirs(GoodDir, exist_ok=True) 27 | if not os.path.exists(BadDir): 28 | os.makedirs(BadDir,exist_ok=True) 29 | 30 | return GoodDir, BadDir 31 | 32 | def parse_args(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--source_path', 35 | type=str, 36 | default="/data/datacleansing/test", 37 | help='Directory containing trained actor model') 38 | parser.add_argument('--dest_path', 39 | type=str, 40 | default="/data/datacleansing/test_store", 41 | help='Directory containing trained actor model') 42 | parser.add_argument('--dataset_name', 43 | type=str, 44 | default="", 45 | help="") 46 | args = parser.parse_args() 47 | return args 48 | 49 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/reward_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from typing import Optional 17 | 18 | from transformers import TrainingArguments 19 | 20 | 21 | @dataclass 22 | class RewardConfig(TrainingArguments): 23 | """ 24 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class. 25 | 26 | Using [`HfArgumentParser`] we can turn this class into 27 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 28 | command line. 29 | 30 | Parameters: 31 | max_length (`int`, *optional*, defaults to `None`): 32 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 33 | gradient_checkpointing (`bool`, *optional*, defaults to `True`): 34 | If True, use gradient checkpointing to save memory at the expense of slower backward pass. 35 | """ 36 | 37 | max_length: Optional[int] = None 38 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" 39 | dataset_num_proc: Optional[int] = None 40 | """Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). 
Recommended value: `0.01`.""" 41 | center_rewards_coefficient: Optional[float] = None 42 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/training_configs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # coding=utf-8 3 | # Copyright 2023 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from dataclasses import dataclass 17 | from typing import Optional 18 | 19 | from transformers import TrainingArguments 20 | 21 | 22 | @dataclass 23 | class RewardConfig(TrainingArguments): 24 | """ 25 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class. 26 | 27 | Using [`HfArgumentParser`] we can turn this class into 28 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 29 | command line. 30 | 31 | Parameters: 32 | max_length (`int`, *optional*, defaults to `None`): 33 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 34 | gradient_checkpointing (`bool`, *optional*, defaults to `True`): 35 | If True, use gradient checkpointing to save memory at the expense of slower backward pass. 36 | """ 37 | 38 | max_length: Optional[int] = None 39 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" 40 | gradient_checkpointing: Optional[bool] = True 41 | """If True, use gradient checkpointing to save memory at the expense of slower backward pass.""" 42 | gradient_checkpointing_kwargs: Optional[dict] = None 43 | """Keyword arguments to pass to the gradient checkpointing function.""" 44 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | import torch 5 | from peft import PeftConfig, PeftModel 6 | from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser 7 | 8 | 9 | @dataclass 10 | class ScriptArguments: 11 | """ 12 | The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the 13 | merged model. 
14 | """ 15 | 16 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) 17 | base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) 18 | output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) 19 | 20 | 21 | parser = HfArgumentParser(ScriptArguments) 22 | script_args = parser.parse_args_into_dataclasses()[0] 23 | assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" 24 | assert script_args.base_model_name is not None, "please provide the name of the Base model" 25 | assert script_args.output_name is not None, "please provide the output name of the merged model" 26 | 27 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 28 | if peft_config.task_type == "SEQ_CLS": 29 | # The sequence classification task is used for the reward model in PPO 30 | model = AutoModelForSequenceClassification.from_pretrained( 31 | script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 32 | ) 33 | else: 34 | model = AutoModelForCausalLM.from_pretrained( 35 | script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 36 | ) 37 | 38 | tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) 39 | 40 | # Load the PEFT model 41 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name) 42 | model.eval() 43 | 44 | model = model.merge_and_unload() 45 | 46 | model.save_pretrained(f"{script_args.output_name}") 47 | tokenizer.save_pretrained(f"{script_args.output_name}") 48 | model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) 49 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # flake8: noqa 17 | 18 | from typing import TYPE_CHECKING 19 | from ..import_utils import _LazyModule, is_diffusers_available, OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = { 23 | "modeling_base": ["PreTrainedModelWrapper", "create_reference_model"], 24 | "modeling_value_head": [ 25 | "AutoModelForCausalLMWithValueHead", 26 | "AutoModelForSeq2SeqLMWithValueHead", 27 | ], 28 | "utils": ["setup_chat_format", "SUPPORTED_ARCHITECTURES", "unwrap_model_for_generation"], 29 | } 30 | 31 | try: 32 | if not is_diffusers_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_sd_base"] = [ 38 | "DDPOPipelineOutput", 39 | "DDPOSchedulerOutput", 40 | "DDPOStableDiffusionPipeline", 41 | "DefaultDDPOStableDiffusionPipeline", 42 | ] 43 | 44 | if TYPE_CHECKING: 45 | from .modeling_base import PreTrainedModelWrapper, create_reference_model 46 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead 47 | from .utils import setup_chat_format, SUPPORTED_ARCHITECTURES 48 | 49 | try: 50 | if not is_diffusers_available(): 51 | raise OptionalDependencyNotAvailable() 52 | except OptionalDependencyNotAvailable: 53 | pass 54 | else: 55 | from .modeling_sd_base import ( 56 | DDPOPipelineOutput, 57 | DDPOSchedulerOutput, 58 | DDPOStableDiffusionPipeline, 59 | DefaultDDPOStableDiffusionPipeline, 60 | ) 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 65 | 
-------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/split_bigdata_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def split(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 14 | if os.path.exists(dest_file): os.remove(dest_file) 15 | of = open(dest_file,'w',encoding='utf-8') 16 | 17 | subsets = sorted(listdir(args.source_path)) 18 | for dir_no,file_name in tqdm(enumerate(subsets),total=len(subsets)): 19 | 20 | input_file = os.path.join(args.source_path,file_name) 21 | with open(input_file, 'r',encoding='utf-8') as f: 22 | for line in f: 23 | line = line.strip() 24 | if len(line) < 1:continue 25 | js_dict = json.loads(line) 26 | #js_dict["id"] = js_dict["note_id"] 27 | #del js_dict["note_id"] 28 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 29 | if of.tell() > args.max_size: 30 | of.close() 31 | global_file_no += 1  # advance the shard index before opening the next part file (otherwise part-000000 gets overwritten on the first rollover) 32 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 33 | if os.path.exists(dest_file): os.remove(dest_file) 34 | of = open(dest_file,'w',encoding='utf-8') 35 | of.close() 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('--source_path', 41 | type=str, 42 | default="/data/tianqingxiang/data/llm/ocr/ocr_infer_result/200W", 43 | help='Directory containing the source jsonl files') 44 | parser.add_argument('--dest_path', 45 | type=str, 46 | default="/root/llm/source_data/cn-JD-ocrtext", 47 | help='Directory to write the split jsonl chunks') 48 | parser.add_argument('--dataset_name', 49 | type=str, 50 | default="cn-JD-ocrtext", 51 | help="") 52 |
parser.add_argument('--max_size', 53 | type=int, 54 | default=200*1024*1024, 55 | help="max chunk size") 56 | args = parser.parse_args() 57 | return args 58 | 59 | if __name__ == "__main__": 60 | args = parse_args() 61 | 62 | if not os.path.exists(args.dest_path): 63 | os.makedirs(args.dest_path, exist_ok=True) 64 | split(args) 65 | 66 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/cli.py: -------------------------------------------------------------------------------- 1 | # This file is a copy of trl/examples/scripts/sft.py so that we could 2 | # use it together with rich and the TRL CLI in a more customizable manner. 3 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import os 17 | import subprocess 18 | import sys 19 | from subprocess import CalledProcessError 20 | 21 | from rich.console import Console 22 | 23 | 24 | SUPPORTED_COMMANDS = ["sft", "dpo", "chat"] 25 | 26 | 27 | def main(): 28 | console = Console() 29 | # Make sure to import things locally to avoid verbose from third party libs. 30 | with console.status("[bold purple]Welcome! Initializing the TRL CLI..."): 31 | from trl.commands.cli_utils import init_zero_verbose 32 | 33 | init_zero_verbose() 34 | 35 | command_name = sys.argv[1] 36 | 37 | if command_name not in SUPPORTED_COMMANDS: 38 | raise ValueError( 39 | f"Please use one of the supported commands, got {command_name} - supported commands are {SUPPORTED_COMMANDS}" 40 | ) 41 | 42 | trl_examples_dir = os.path.dirname(__file__) 43 | 44 | # Force-use rich if the `TRL_USE_RICH` env var is not set 45 | if "TRL_USE_RICH" not in os.environ: 46 | os.environ["TRL_USE_RICH"] = "1" 47 | 48 | if command_name == "chat": 49 | command = f""" 50 | python {trl_examples_dir}/scripts/{command_name}.py {" ".join(sys.argv[2:])} 51 | """ 52 | else: 53 | command = f""" 54 | accelerate launch {trl_examples_dir}/scripts/{command_name}.py {" ".join(sys.argv[2:])} 55 | """ 56 | 57 | try: 58 | subprocess.run( 59 | command.split(), 60 | text=True, 61 | check=True, 62 | encoding="utf-8", 63 | cwd=os.getcwd(), 64 | env=os.environ.copy(), 65 | ) 66 | except (CalledProcessError, ChildProcessError) as exc: 67 | console.log(f"TRL - {command_name.upper()} failed on ! See the logs above for further details.") 68 | raise ValueError("TRL CLI failed! 
Check the traceback above..") from exc 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/random_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | 13 | def random_sample_benchmark(sample_num,input_dir): 14 | 15 | files = sorted(glob.glob(os.path.join(input_dir,"*.jsonl"), recursive=True)) 16 | 17 | avg_nums_per_task = math.ceil(sample_num/len(files)) 18 | 19 | sample_1000 = [] 20 | for file in tqdm(files,total=len(files)): 21 | filename = os.path.basename(file).replace(".jsonl","") 22 | data = [] 23 | for line in open(file,"r",encoding='utf-8'): 24 | try: 25 | js = json.loads(line.strip()) 26 | except json.decoder.JSONDecodeError: 27 | print(line) 28 | data.append(js) 29 | random.shuffle(data) 30 | print(f"process file {filename}, total of {len(data)}.") 31 | sample_1000.extend(data[0:avg_nums_per_task]) 32 | return sample_1000 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--dataset_name', 37 | type=str, 38 | default="xhs", 39 | help='dataset name') 40 | parser.add_argument('--dataset_path', 41 | type=str, 42 | default="/yuan1.0/open_source_1T", 43 | help='source path') 44 | parser.add_argument('--output_path', 45 | type=str, 46 | default="./", 47 | help='source path') 48 | 49 | parser.add_argument("--number_sample", 50 | type=int, 51 | default=100, 52 | help="number of sampled data" 53 | ) 54 | parser.add_argument('--version', 55 | type=str, 56 | default="v1", 57 | help="" 58 | ) 59 | args = parser.parse_args() 60 | return args 61 | 62 | if __name__ == '__main__': 63 | 64 | args = parse_args() 65 | 66 | sample_benchmark = [] 67 | 68 | sample_benchmark = random_sample_benchmark( 69 | sample_num=args.number_sample, 70 | input_dir=args.dataset_path 71 | ) 72 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 73 | 74 | output_file=f"{args.output_path}/{args.dataset_name}-sample{args.number_sample}-{args.version}.jsonl" 75 | if os.path.exists(output_file): os.remove(output_file) 76 | fo = open(output_file, 'w', encoding='utf-8') 77 | 78 | for idx, item in tqdm(enumerate(sample_benchmark),total=len(sample_benchmark)): 79 | item["id"] = idx + 1 80 | jstr = json.dumps(item, ensure_ascii=False) 81 | fo.write(jstr+"\n") 82 | fo.close() 83 | print(f"Output file {output_file}, total sampled {len(sample_benchmark)}") 84 | 85 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-baidu_weixin-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | def make_clean(args): 10 | 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | 15 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 16 | if os.path.exists(dest_file): os.remove(dest_file) 17 | global_file_no += 1 18 | of = open(dest_file,'w',encoding='utf-8') 19 | 20 | subfiles = sorted(listdir(args.source_path)) 21 | for dir_no,subfile in tqdm(enumerate(subfiles),total=len(subfiles)): 22 | 23 | input_file = 
os.path.join(args.source_path,subfile) 24 | 25 | with open(input_file, 'r') as f: 26 | datalist = f.readlines() 27 | 28 | for line in datalist: 29 | line = line.strip() 30 | if len(line) < 1: 31 | continue 32 | 33 | js_data = json.loads(line) 34 | js_dict = {} 35 | js_dict["id"] = global_id_no 36 | js_dict["source"] = "cn-baidu-weixin" 37 | js_dict["source_id"] = js_data['url'] 38 | js_dict["subset"] = js_data["search_keyword"] 39 | js_dict["content"] = js_data["content"] 40 | global_id_no += 1 41 | 42 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 43 | if of.tell() > args.max_size: 44 | of.close() 45 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 46 | if os.path.exists(dest_file): os.remove(dest_file) 47 | of = open(dest_file,'w',encoding='utf-8') 48 | global_file_no += 1 49 | of.close() 50 | 51 | def parse_args(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--source_path', 54 | type=str, 55 | default="/data/data_warehouse/SourceData/baidu_weixin/231027/", 56 | help='Directory containing trained actor model') 57 | parser.add_argument('--dest_path', 58 | type=str, 59 | default="/localdisk/llm/source_data/cn-baidu-weixin", 60 | help='Directory containing trained actor model') 61 | parser.add_argument('--dataset_name', 62 | type=str, 63 | default="cn-baidu-weixin", 64 | help="") 65 | parser.add_argument('--max_size', 66 | type=int, 67 | default=200 * 1024 * 1024, 68 | help="max chunk size") 69 | args = parser.parse_args() 70 | return args 71 | 72 | if __name__ == "__main__": 73 | args = parse_args() 74 | 75 | if not os.path.exists(args.dest_path): 76 | os.makedirs(args.dest_path, exist_ok=True) 77 | make_clean(args) 78 | 79 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/postrain_with_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | log_out=0 5 | only_print=0 6 | dist_only_print=0 7 | enable_flash_attn="True" 8 | tie_embed="False" 9 | 10 | datestr=`date +"%Y-%m-%d"` 11 | wandb_run_name="dpo-sftExp8.3-Qwen1.5-14B-cp1006-$datestr" 12 | 13 | output_path=/mnt/lptest/sftExp8.3-Qwen1.5-14B-checkpoint-1006-post 14 | ckpt_path=/mnt/lptest/sftExp8.3-Qwen1.5-14B/sftExp8.3-Qwen1.5-14B-checkpoint-1006 15 | model_type="Qwen" 16 | 17 | pd_token="3xxxx" 18 | data_suffix="*.jsonl" 19 | train_data_path=/mnt/lptest/xubu/dpo_dataGen/dpo_preference_data/train 20 | dev_data_path=/mnt/lptest/xubu/dpo_dataGen/dpo_preference_data/dev 21 | 22 | num_processes=32 23 | beta=0.1 24 | bs_per_dev=2 25 | grad_acc_steps=2 26 | 27 | # save model per 500 global_step (2B token, 3h) 28 | ckpt_steps=281 29 | eval_steps=281 30 | 31 | # Direct Preference Optimization 32 | train_epoch=3 33 | lr=5e-6 34 | warmup_ratio=0.02 35 | 36 | max_length=2048 37 | max_prompt_length=1024 38 | max_target_length=1024 39 | 40 | strategy=zero3 41 | sanity_check=False 42 | 43 | # Run Command 44 | REPO=$(pwd) 45 | config=$REPO/scripts/accelerate_configs/deepspeed_${strategy}.yaml 46 | echo "config file: $config" 47 | 48 | CMD="" 49 | 50 | CMD="$CMD PYTHONPATH=$REPO" 51 | CMD="$CMD accelerate launch" 52 | 53 | CMD="$CMD --num_processes=$num_processes --config_file=$config" 54 | CMD="$CMD $REPO/xllm/postrain.py" 55 | 56 | CMD="$CMD --bf16 --beta $beta --model_name_or_path $ckpt_path --learning_rate $lr --model_architecture_type $model_type" 57 | CMD="$CMD --per_device_train_batch_size $bs_per_dev --gradient_accumulation_steps 
$grad_acc_steps" 58 | 59 | CMD="$CMD --max_length $max_length --max_prompt_length $max_prompt_length --max_target_length $max_target_length" 60 | CMD="$CMD --sanity_check $sanity_check --report_to 'wandb' --run_name $wandb_run_name" 61 | 62 | # --tie_word_embeddings $tie_embed --enable_flash_attn $enable_flash_attn 63 | CMD="$CMD --ignore_bias_buffers False --logging_steps 1" 64 | # --tie_word_embeddings $tie_embed --enable_flash_attn $enable_flash_attn 65 | 66 | CMD="$CMD --train_dataset_path $train_data_path --test_dataset_path $dev_data_path --data_suffix $data_suffix" 67 | 68 | CMD="$CMD --eval_steps $eval_steps --num_train_epochs $train_epoch --warmup_ratio $warmup_ratio" 69 | 70 | CMD="$CMD --output_dir $output_path --save_steps $ckpt_steps --logging_first_step --no_remove_unused_columns" 71 | 72 | CMD="$CMD --gradient_checkpointing True --weight_decay 0.1 --max_grad_norm 1.0" 73 | 74 | 75 | echo $CMD 76 | printf "===== Running Command =====\n" 77 | printf "\t%s\n\n" "$CMD" 78 | 79 | if [[ $only_print == "0" ]]; then 80 | printf "===== Command Logs =====\n" 81 | if [[ $log_out == "1" ]]; then 82 | echo "Command is running...." 83 | echo "Please run [tail -f ${log_file}] in another shell to monitoring the running process." 84 | fi 85 | if [[ -d $REPO/logs ]]; then 86 | timestamp=$(date +"%Y%m%d.%H.%M.%S") 87 | mv $REPO/logs $REPO/logs.$timestamp 88 | fi 89 | mkdir $REPO/logs 90 | eval "$CMD" 91 | fi 92 | 93 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-e-txt-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | def make_clean(args): 10 | 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 15 | if os.path.exists(dest_file): os.remove(dest_file) 16 | global_file_no += 1 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(subfiles),total=len(subfiles)): 21 | 22 | input_file = os.path.join(args.source_path,subfile) 23 | if not (input_file.endswith(".txt") or input_file.endswith(".shtml")): continue 24 | 25 | html_str = open(input_file, 'rb').read() 26 | encoding_info = chardet.detect(html_str) 27 | original_encoding = encoding_info['encoding'] 28 | if original_encoding not in ["UTF-8","GB2312","GB18030","Big5","utf-8","UTF-16","UTF-32"]: continue 29 | 30 | html_str = html_str.decode(original_encoding, 'ignore')#.encode('utf-8') 31 | if len(html_str) < 256: continue 32 | 33 | js_dict = {} 34 | js_dict["id"] = global_id_no 35 | js_dict["source"] = "cn-e-txt" 36 | js_dict["subset"] = os.path.basename(subfile).replace(".txt","") 37 | js_dict["source_id"] = "" 38 | global_id_no += 1 39 | 40 | js_dict["content"] = html_str 41 | 42 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 43 | if of.tell() > args.max_size: 44 | of.close() 45 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 46 | if os.path.exists(dest_file): os.remove(dest_file) 47 | of = open(dest_file,'w',encoding='utf-8') 48 | global_file_no += 1 49 | of.close() 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--source_path', 55 | type=str, 56 | 
default="/data/data_warehouse/SourceData/txt", 57 | help='Directory containing trained actor model') 58 | parser.add_argument('--dest_path', 59 | type=str, 60 | default="/localdisk/llm/source_data/cn-e-txt", 61 | help='Directory containing trained actor model') 62 | parser.add_argument('--dataset_name', 63 | type=str, 64 | default="cn-e-txt", 65 | help="") 66 | parser.add_argument('--max_size', 67 | type=int, 68 | default=200 * 1024 * 1024, 69 | help="max chunk size") 70 | args = parser.parse_args() 71 | return args 72 | 73 | if __name__ == "__main__": 74 | args = parse_args() 75 | 76 | if not os.path.exists(args.dest_path): 77 | os.makedirs(args.dest_path, exist_ok=True) 78 | make_clean(args) 79 | 80 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import unittest 15 | 16 | import torch 17 | 18 | from trl import is_diffusers_available, is_peft_available, is_wandb_available, is_xpu_available 19 | 20 | 21 | def require_peft(test_case): 22 | """ 23 | Decorator marking a test that requires peft. Skips the test if peft is not available. 24 | """ 25 | if not is_peft_available(): 26 | test_case = unittest.skip("test requires peft")(test_case) 27 | return test_case 28 | 29 | 30 | def require_diffusers(test_case): 31 | """ 32 | Decorator marking a test that requires diffusers. Skips the test if diffusers is not available. 33 | """ 34 | if not is_diffusers_available(): 35 | test_case = unittest.skip("test requires diffusers")(test_case) 36 | return test_case 37 | 38 | 39 | def require_wandb(test_case, required: bool = True): 40 | """ 41 | Decorator marking a test that requires wandb. Skips the test if wandb is not available. 42 | """ 43 | # XOR, i.e.: 44 | # skip if available and required = False and 45 | # skip if not available and required = True 46 | if is_wandb_available() ^ required: 47 | test_case = unittest.skip("test requires wandb")(test_case) 48 | return test_case 49 | 50 | 51 | def require_no_wandb(test_case): 52 | """ 53 | Decorator marking a test that requires no wandb. Skips the test if wandb is available. 54 | """ 55 | return require_wandb(test_case, required=False) 56 | 57 | 58 | def require_bitsandbytes(test_case): 59 | """ 60 | Decorator marking a test that requires bitsandbytes. Skips the test if bitsandbytes is not available. 61 | """ 62 | try: 63 | import bitsandbytes # noqa: F401 64 | except ImportError: 65 | test_case = unittest.skip("test requires bitsandbytes")(test_case) 66 | return test_case 67 | 68 | 69 | def require_torch_multi_gpu(test_case): 70 | """ 71 | Decorator marking a test that requires multiple GPUs. Skips the test if there aren't enough GPUs. 
72 | """ 73 | if torch.cuda.device_count() < 2: 74 | test_case = unittest.skip("test requires multiple GPUs")(test_case) 75 | return test_case 76 | 77 | 78 | def require_torch_multi_xpu(test_case): 79 | """ 80 | Decorator marking a test that requires multiple XPUs. Skips the test if there aren't enough XPUs. 81 | """ 82 | if torch.xpu.device_count() < 2 and is_xpu_available(): 83 | test_case = unittest.skip("test requires multiple XPUs")(test_case) 84 | return test_case 85 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/requirements/pip_dpo_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.28.0 2 | aiohappyeyeballs==2.3.5 3 | aiohttp==3.10.3 4 | aiosignal==1.3.1 5 | annotated-types==0.7.0 6 | anyio==4.4.0 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | cachetools==5.4.0 10 | certifi==2024.7.4 11 | charset-normalizer==3.3.2 12 | click==8.1.7 13 | cloudpickle==3.0.0 14 | cmake==3.30.2 15 | datasets==2.21.0 16 | deepspeed==0.14.5 17 | dill==0.3.8 18 | diskcache==5.6.3 19 | distro==1.9.0 20 | docker-pycreds==0.4.0 21 | docstring_parser==0.16 22 | einops==0.8.0 23 | exceptiongroup==1.2.2 24 | fastapi==0.112.0 25 | fastchat==0.1.0 26 | filelock==3.15.4 27 | flash_attn==2.6.3 28 | frozenlist==1.4.1 29 | fsspec==2024.6.1 30 | gitdb==4.0.11 31 | GitPython==3.1.43 32 | h11==0.14.0 33 | hjson==3.1.0 34 | httpcore==1.0.5 35 | httptools==0.6.1 36 | httpx==0.27.0 37 | huggingface-hub==0.24.5 38 | idna==3.7 39 | interegular==0.3.3 40 | Jinja2==3.1.4 41 | jiter==0.5.0 42 | joblib==1.4.2 43 | jsonschema==4.23.0 44 | jsonschema-specifications==2023.12.1 45 | lark==1.2.2 46 | llvmlite==0.43.0 47 | lm-format-enforcer==0.10.1 48 | loguru==0.7.2 49 | markdown-it-py==3.0.0 50 | MarkupSafe==2.1.5 51 | mdurl==0.1.2 52 | mpmath==1.3.0 53 | msgpack==1.0.8 54 | multidict==6.0.5 55 | multiprocess==0.70.16 56 | nest-asyncio==1.6.0 57 | networkx==3.3 58 | ninja==1.11.1.1 59 | numba==0.60.0 60 | numpy==1.26.4 61 | nvidia-cublas-cu12==12.1.3.1 62 | nvidia-cuda-cupti-cu12==12.1.105 63 | nvidia-cuda-nvrtc-cu12==12.1.105 64 | nvidia-cuda-runtime-cu12==12.1.105 65 | nvidia-cudnn-cu12==8.9.2.26 66 | nvidia-cufft-cu12==11.0.2.54 67 | nvidia-curand-cu12==10.3.2.106 68 | nvidia-cusolver-cu12==11.4.5.107 69 | nvidia-cusparse-cu12==12.1.0.106 70 | nvidia-ml-py==12.535.161 71 | nvidia-nccl-cu12==2.20.5 72 | nvidia-nvjitlink-cu12==12.6.20 73 | nvidia-nvtx-cu12==12.1.105 74 | nvitop==1.3.2 75 | openai==1.40.6 76 | outlines==0.0.34 77 | packaging==24.1 78 | pandas==2.2.2 79 | platformdirs==4.2.2 80 | prometheus-fastapi-instrumentator==7.0.0 81 | prometheus_client==0.20.0 82 | protobuf==5.27.3 83 | psutil==6.0.0 84 | py-cpuinfo==9.0.0 85 | pyarrow==17.0.0 86 | pycryptodome==3.20.0 87 | pydantic==2.8.2 88 | pydantic_core==2.20.1 89 | Pygments==2.18.0 90 | python-dateutil==2.9.0.post0 91 | python-dotenv==1.0.1 92 | pytz==2024.1 93 | PyYAML==6.0.2 94 | ray==2.34.0 95 | referencing==0.35.1 96 | regex==2024.7.24 97 | requests==2.32.3 98 | rich==13.7.1 99 | rpds-py==0.20.0 100 | safetensors==0.4.4 101 | scipy==1.14.0 102 | sentencepiece==0.2.0 103 | sentry-sdk==2.13.0 104 | setproctitle==1.3.3 105 | shtab==1.7.1 106 | six==1.16.0 107 | smmap==5.0.1 108 | sniffio==1.3.1 109 | starlette==0.37.2 110 | sympy==1.13.2 111 | termcolor==2.4.0 112 | tiktoken==0.7.0 113 | tokenizers==0.15.2 114 | torch==2.3.0 115 | tqdm==4.66.5 116 | transformers==4.38.2 117 | triton==2.3.0 118 | trl==0.9.6 119 | typing_extensions==4.12.2 
120 | tyro==0.8.8 121 | tzdata==2024.1 122 | urllib3==2.2.2 123 | uvicorn==0.30.6 124 | uvloop==0.19.0 125 | vllm==0.4.3 126 | vllm-flash-attn==2.5.8.post2 127 | wandb==0.17.7 128 | watchfiles==0.23.0 129 | websockets==12.0 130 | xformers==0.0.26.post1 131 | xxhash==3.5.0 132 | yarl==1.9.4 133 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/pretrain_data_sampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | def random_sample_benchmark(sample_num,input_dir): 13 | 14 | files = sorted(glob.glob(os.path.join(input_dir,"v*"), recursive=True)) 15 | good_dir = os.path.join(input_dir,files[-1],"good") 16 | 17 | input_files = sorted(glob.glob(os.path.join(good_dir,"*.jsonl"), recursive=True)) 18 | avg_nums_per_task = math.ceil(sample_num/len(input_files)) 19 | 20 | sample_1000 = [] 21 | for file in tqdm(input_files,total=len(input_files)): 22 | filename = os.path.basename(file).replace(".jsonl","") 23 | data = [] 24 | for line in open(file,"r",encoding='utf-8'): 25 | try: 26 | js = json.loads(line.strip()) 27 | except json.decoder.JSONDecodeError: 28 | print(line) 29 | data.append(js) 30 | random.shuffle(data) 31 | print(f"process file {filename}, total of {len(data)}.") 32 | sample_1000.extend(data[0:avg_nums_per_task]) 33 | return sample_1000 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dataset_name', 38 | type=str, 39 | default="SafetyCheck", 40 | help='dataset name') 41 | parser.add_argument('--dataset_path', 42 | type=str, 43 | default="/data/data_warehouse/llm/clean_data/", 44 | help='source path') 45 | parser.add_argument('--output_path', 46 | type=str, 47 | default="./", 48 | help='source path') 49 | 50 | parser.add_argument("--number_sample", 51 | type=int, 52 | default=100000, 53 | help="number of sampled data" 54 | ) 55 | parser.add_argument('--version', 56 | type=str, 57 | default="v1", 58 | help="" 59 | ) 60 | args = parser.parse_args() 61 | return args 62 | 63 | if __name__ == '__main__': 64 | 65 | args = parse_args() 66 | sample_benchmark = [] 67 | 68 | subdirs = sorted(glob.glob(os.path.join(args.dataset_path,"cn-*"), recursive=True)) 69 | print(f"{subdirs}") 70 | for subdir in subdirs: 71 | dir_ = os.path.join(args.dataset_path,subdir) 72 | print(f"Processing {dir_}...") 73 | samples = random_sample_benchmark( 74 | sample_num=args.number_sample, 75 | input_dir=dir_ 76 | ) 77 | sample_benchmark.extend(samples) 78 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 79 | 80 | output_file=f"{args.output_path}/{args.dataset_name}-sample-2k.jsonl" 81 | if os.path.exists(output_file): os.remove(output_file) 82 | fo = open(output_file, 'w', encoding='utf-8') 83 | 84 | random.shuffle(sample_benchmark) 85 | for idx, item in tqdm(enumerate(sample_benchmark),total=len(sample_benchmark)): 86 | item["id"] = idx + 1 87 | jstr = json.dumps(item, ensure_ascii=False) 88 | fo.write(jstr+"\n") 89 | fo.close() 90 | print(f"Output file {output_file}, total sampled {len(samples)}") 91 | 92 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/search_pretraindata-checkpoint.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def make_clean(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | 14 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 15 | if os.path.exists(dest_file): os.remove(dest_file) 16 | global_file_no += 1 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subsets = sorted(listdir(args.source_path)) 20 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 21 | 22 | if subset_dir.find("cn-") == -1: continue 23 | 24 | file_dir = os.path.join(args.source_path,subset_dir) 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | for file in files: 29 | if not file.endswith(".jsonl"):continue 30 | input_file = os.path.join(root,file) 31 | print("input_file:",input_file) 32 | with open(input_file, 'r',encoding='utf-8') as f: 33 | for line in f: 34 | js_dict = json.loads(line) 35 | 36 | content = js_dict["content"] 37 | if content.find("时代在召唤") == -1 and content.find("长城Assistant") == -1: continue  # keep only documents that contain at least one of the two marker phrases 38 | 39 | if content.find("时代在召唤") >= 0: 40 | js_dict["datatype"] = "时代在召唤" 41 | elif content.find("长城Assistant") >= 0: 42 | js_dict["datatype"] = "长城Assistant" 43 | 44 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 45 | if of.tell() > args.max_size: 46 | of.close() 47 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 48 | if os.path.exists(dest_file): os.remove(dest_file) 49 | of = open(dest_file,'w',encoding='utf-8') 50 | global_file_no += 1 51 | of.close() 52 | 53 | 54 | def parse_args(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--source_path', 57 | type=str, 58 | default="/llm-data-org.del/", 59 | help='source data directory') 60 | parser.add_argument('--dest_path', 61 | type=str, 62 | default="/localdisk/datacleaner/preprocess/", 63 | help='destination directory for the extracted jsonl chunks') 64 | parser.add_argument('--dataset_name', 65 | type=str, 66 | default="cn-mnbvc", 67 | help="") 68 | parser.add_argument('--max_size', 69 | type=int, 70 | default=200 * 1024 * 1024, 71 | help="max chunk size") 72 | args = parser.parse_args() 73 | return args 74 | 75 | if __name__ == "__main__": 76 | args = parse_args() 77 | 78 | if not os.path.exists(args.dest_path): 79 | os.makedirs(args.dest_path, exist_ok=True) 80 | make_clean(args) 81 | 82 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/safetycheck_random_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | def random_sample_benchmark(sample_num,input_dir): 13 | 14 | files = sorted(glob.glob(os.path.join(input_dir,"v*"), recursive=True)) 15 | good_dir = os.path.join(input_dir,files[-1],"good") 16 | 17 | input_files = sorted(glob.glob(os.path.join(good_dir,"*.jsonl"), recursive=True)) 18 | avg_nums_per_task = math.ceil(sample_num/len(input_files)) 19 | 20 | sample_1000 = [] 21 | for file in tqdm(input_files,total=len(input_files)): 22 | filename = os.path.basename(file).replace(".jsonl","") 23 | data = [] 24 | for line in
open(file,"r",encoding='utf-8'): 25 | try: 26 | js = json.loads(line.strip()) 27 | except json.decoder.JSONDecodeError: 28 | print(line); continue  # skip lines that fail to parse instead of re-appending the previous record 29 | data.append(js) 30 | random.shuffle(data) 31 | print(f"process file {filename}, total of {len(data)}.") 32 | sample_1000.extend(data[0:avg_nums_per_task]) 33 | return sample_1000 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dataset_name', 38 | type=str, 39 | default="SafetyCheck", 40 | help='dataset name') 41 | parser.add_argument('--dataset_path', 42 | type=str, 43 | default="/data/data_warehouse/llm/clean_data/", 44 | help='source path') 45 | parser.add_argument('--output_path', 46 | type=str, 47 | default="./", 48 | help='output path') 49 | 50 | parser.add_argument("--number_sample", 51 | type=int, 52 | default=100000, 53 | help="number of sampled data" 54 | ) 55 | parser.add_argument('--version', 56 | type=str, 57 | default="v1", 58 | help="" 59 | ) 60 | args = parser.parse_args() 61 | return args 62 | 63 | if __name__ == '__main__': 64 | 65 | args = parse_args() 66 | sample_benchmark = [] 67 | 68 | subdirs = sorted(glob.glob(os.path.join(args.dataset_path,"cn-*"), recursive=True)) 69 | print(f"{subdirs}") 70 | for subdir in subdirs: 71 | dir_ = os.path.join(args.dataset_path,subdir) 72 | print(f"Processing {dir_}...") 73 | samples = random_sample_benchmark( 74 | sample_num=args.number_sample, 75 | input_dir=dir_ 76 | ) 77 | sample_benchmark.extend(samples) 78 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 79 | 80 | output_file=f"{args.output_path}/{args.dataset_name}-sample-2k.jsonl" 81 | if os.path.exists(output_file): os.remove(output_file) 82 | fo = open(output_file, 'w', encoding='utf-8') 83 | 84 | random.shuffle(sample_benchmark) 85 | samples = sample_benchmark[0:2000] 86 | 87 | for idx, item in tqdm(enumerate(samples),total=len(samples)): 88 | item["id"] = idx + 1 89 | jstr = json.dumps(item, ensure_ascii=False) 90 | fo.write(jstr+"\n") 91 | fo.close() 92 | print(f"Output file {output_file}, total sampled {len(samples)}") 93 | 94 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/clean_headtails_from_content.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | import os 3 | import sys 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 5 | from flashtext import KeywordProcessor 6 | from utils.util import load_list_from_structedTxt 7 | 8 | class CleanHeadTailsFromContent: 9 | def __init__(self, keyphrase_file, thresh_hold=5): 10 | self.ads_wechat_flashtext = KeywordProcessor() 11 | 12 | ads_phrase_list = load_list_from_structedTxt(keyphrase_file) 13 | print(f"load {len(ads_phrase_list)} ads phrases:",ads_phrase_list) 14 | self.ads_wechat_flashtext.add_keywords_from_list(ads_phrase_list) 15 | self.split_flg = ['。','\n','!','?'] 16 | self.thresh_hold = thresh_hold 17 | 18 | def clean(self,text): 19 | text = text.strip() 20 | text = self.forward(text) 21 | #print("forward:",text) 22 | text = self.backward(text) 23 | #print("backward:",text) 24 | return text 25 | 26 | def forward(self,text): 27 | 28 | prev_density = 0.0 29 | prev_idx = 0 30 | 31 | no_hit_sentence_cnt = 0 32 | hit_pos = 0 33 | 34 | tlen = len(text) 35 | while prev_idx < tlen: 36 | curr_idx = prev_idx 37 | while curr_idx < tlen and text[curr_idx] not in self.split_flg: curr_idx += 1 38 | 39 | head_text = text[prev_idx:curr_idx+1] 40 | if len(head_text)
< 1: 41 | prev_idx = curr_idx + 1 42 | continue 43 | diff_cnt,keylen = self.calculate_density(head_text) 44 | #print(f"head_text:{head_text}, diff_cnt:{diff_cnt}, keylen:{keylen}") 45 | if diff_cnt < 1: 46 | prev_idx = curr_idx + 1 47 | no_hit_sentence_cnt += 1 48 | if no_hit_sentence_cnt >= self.thresh_hold: break 49 | else: 50 | hit_pos = curr_idx + 1 51 | prev_idx = curr_idx + 1 52 | no_hit_sentence_cnt = 0 53 | text = text[hit_pos:].strip() 54 | return text 55 | 56 | def backward(self,text): 57 | last_density = 0.0 58 | last_idx = len(text) - 1 59 | 60 | no_hit_sentence_cnt = 0 61 | hit_pos = last_idx 62 | 63 | while last_idx > 0: 64 | curr_idx = last_idx 65 | while curr_idx > 0 and text[curr_idx] not in self.split_flg: curr_idx -= 1 66 | 67 | tail_text = text[curr_idx+1:last_idx+1] 68 | if len(tail_text) < 1: 69 | last_idx = curr_idx - 1 70 | continue 71 | diff_cnt,keylen = self.calculate_density(tail_text) 72 | #print(f"tail_text:{tail_text}, diff_cnt:{diff_cnt}, keylen:{keylen}") 73 | if diff_cnt < 1: 74 | last_idx = curr_idx 75 | no_hit_sentence_cnt += 1 76 | if no_hit_sentence_cnt >= self.thresh_hold: break 77 | else: 78 | hit_pos = curr_idx 79 | last_idx = curr_idx 80 | no_hit_sentence_cnt = 0 81 | text = text[0:hit_pos+1].strip() 82 | return text 83 | 84 | def calculate_density(self,text): 85 | keyword_list = self.ads_wechat_flashtext.extract_keywords(text) 86 | #print("keyword_list:",keyword_list) 87 | keylen = sum([len(item) for item in keyword_list]) 88 | #ratio = 1.0*keylen / (len(text) + 0.005) 89 | diff_cnt = len(set(keyword_list)) 90 | return diff_cnt,keylen 91 | 92 | 93 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_best_of_n_sampler.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | from transformers import AutoTokenizer, GenerationConfig 5 | 6 | from trl import AutoModelForCausalLMWithValueHead 7 | from trl.core import LengthSampler 8 | from trl.extras import BestOfNSampler 9 | 10 | 11 | def queries_to_scores(list_of_strings): 12 | return [torch.rand(1).item() for _ in list_of_strings] 13 | 14 | 15 | class BestOfNSamplerTester(unittest.TestCase): 16 | """ 17 | Tests the BestOfNSampler class 18 | """ 19 | 20 | ref_model_name = "trl-internal-testing/dummy-GPT2-correct-vocab" 21 | output_length_sampler = LengthSampler(2, 6) 22 | model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name) 23 | tokenizer = AutoTokenizer.from_pretrained(ref_model_name) 24 | tokenizer.pad_token = tokenizer.eos_token 25 | output_length_sampler = LengthSampler(2, 6) 26 | 27 | def test_different_input_types(self): 28 | r""" 29 | Tests if the different input types normalizer works 30 | """ 31 | 32 | generation_config = GenerationConfig( 33 | min_length=-1, 34 | top_k=0.0, 35 | top_p=1.0, 36 | do_sample=True, 37 | pad_token_id=self.tokenizer.eos_token_id, 38 | ) 39 | 40 | output_length_sampler = LengthSampler(2, 6) 41 | 42 | best_of_n = BestOfNSampler( 43 | self.model, 44 | self.tokenizer, 45 | queries_to_scores, 46 | length_sampler=output_length_sampler, 47 | generation_config=generation_config, 48 | ) 49 | 50 | queries = ["hello world", "goodbye world"] 51 | tokenized_queries = [self.tokenizer.encode(query) for query in queries] 52 | 53 | various_queries_formats = [ 54 | (tokenized_queries[0], 1), 55 | (tokenized_queries, 2), 56 | (torch.tensor(tokenized_queries[1]), 1), 57 | ([torch.tensor(query) for query in 
tokenized_queries], 2), 58 | ] 59 | 60 | for q, expected_length in various_queries_formats: 61 | results = best_of_n.generate(q) 62 | self.assertIsInstance(results, list) 63 | assert len(results) == expected_length 64 | 65 | def test_different_sample_sizes_and_n_candidates_values(self): 66 | r""" 67 | Tests different sample sizes and n_candidates values 68 | """ 69 | generation_config = GenerationConfig( 70 | min_length=-1, 71 | top_k=0.0, 72 | top_p=1.0, 73 | do_sample=True, 74 | pad_token_id=self.tokenizer.eos_token_id, 75 | ) 76 | 77 | output_length_sampler = LengthSampler(6, 10) 78 | 79 | for sample_value, n_candidates_values, expected in [ 80 | (4, 2, 2), 81 | (10, 3, 3), 82 | (6, 4, 4), 83 | ]: 84 | best_of_n = BestOfNSampler( 85 | self.model, 86 | self.tokenizer, 87 | queries_to_scores, 88 | length_sampler=output_length_sampler, 89 | generation_config=generation_config, 90 | sample_size=sample_value, 91 | n_candidates=n_candidates_values, 92 | ) 93 | 94 | queries = ["hello world", "troll the world"] 95 | tokenized_queries = [self.tokenizer.encode(query) for query in queries] 96 | results = best_of_n.generate(tokenized_queries) 97 | for result in results: 98 | assert len(result) == expected 99 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-kindle-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | 10 | def make_clean(args): 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | subsets = sorted(listdir(args.source_path)) 15 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 16 | 17 | #subset_dir = subset_dir.replace(" ","\ ") 18 | file_dir = os.path.join(args.source_path,subset_dir) 19 | 20 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 21 | if os.path.exists(dest_file): os.remove(dest_file) 22 | global_file_no += 1 23 | of = open(dest_file,'w',encoding='utf-8') 24 | 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | 29 | #root = root.replace(" ","\ ") 30 | for file in files: 31 | #file = file.replace(" ","\ ") 32 | if not (file.endswith(".txt") or file.endswith(".shtml")): continue 33 | input_file = os.path.join(root,file) 34 | 35 | html_str = open(input_file, 'rb').read() 36 | encoding_info = chardet.detect(html_str) 37 | original_encoding = encoding_info['encoding'] 38 | if original_encoding not in ["UTF-8","GB2312","GB18030","Big5","utf-8","UTF-16","UTF-32"]: continue 39 | 40 | html_str = html_str.decode(original_encoding, 'ignore')#.encode('utf-8') 41 | if len(html_str) < 512: continue 42 | 43 | js_dict = {} 44 | js_dict["id"] = global_id_no 45 | js_dict["source"] = "cn-kindle" 46 | js_dict["subset"] = subset_dir 47 | js_dict["source_id"] = input_file 48 | global_id_no += 1 49 | 50 | js_dict["content"] = html_str 51 | 52 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 53 | if of.tell() > args.max_size: 54 | of.close() 55 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 56 | if os.path.exists(dest_file): os.remove(dest_file) 57 | of = open(dest_file,'w',encoding='utf-8') 58 | global_file_no += 1 59 | of.close() 60 | 61 | 62 | def parse_args(): 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--source_path', 65 | 
type=str, 66 | default="/data/data_warehouse/llm/source_data/cn-kindle", 67 | help='Directory containing trained actor model') 68 | parser.add_argument('--dest_path', 69 | type=str, 70 | default="/data/data_warehouse/llm/source_data/cn-kindle2", 71 | help='Directory containing trained actor model') 72 | parser.add_argument('--dataset_name', 73 | type=str, 74 | default="cn-kindle", 75 | help="") 76 | parser.add_argument('--max_size', 77 | type=int, 78 | default=500 * 1024 * 1024, 79 | help="max chunk size") 80 | args = parser.parse_args() 81 | return args 82 | 83 | if __name__ == "__main__": 84 | args = parse_args() 85 | 86 | if not os.path.exists(args.dest_path): 87 | os.makedirs(args.dest_path, exist_ok=True) 88 | make_clean(args) 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/auxiliary_modules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | 16 | import torch 17 | import torch.nn as nn 18 | import torchvision 19 | from huggingface_hub import hf_hub_download 20 | from huggingface_hub.utils import EntryNotFoundError 21 | from transformers import CLIPModel 22 | 23 | from trl.import_utils import is_npu_available, is_xpu_available 24 | 25 | 26 | class MLP(nn.Module): 27 | def __init__(self): 28 | super().__init__() 29 | self.layers = nn.Sequential( 30 | nn.Linear(768, 1024), 31 | nn.Dropout(0.2), 32 | nn.Linear(1024, 128), 33 | nn.Dropout(0.2), 34 | nn.Linear(128, 64), 35 | nn.Dropout(0.1), 36 | nn.Linear(64, 16), 37 | nn.Linear(16, 1), 38 | ) 39 | 40 | def forward(self, embed): 41 | return self.layers(embed) 42 | 43 | 44 | class AestheticScorer(torch.nn.Module): 45 | """ 46 | This model attempts to predict the aesthetic score of an image. The aesthetic score 47 | is a numerical approximation of how much a specific image is liked by humans on average. 
48 | This is from https://github.com/christophschuhmann/improved-aesthetic-predictor 49 | """ 50 | 51 | def __init__(self, *, dtype, model_id, model_filename): 52 | super().__init__() 53 | self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 54 | self.normalize = torchvision.transforms.Normalize( 55 | mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711] 56 | ) 57 | self.target_size = 224 58 | self.mlp = MLP() 59 | try: 60 | cached_path = hf_hub_download(model_id, model_filename) 61 | except EntryNotFoundError: 62 | cached_path = os.path.join(model_id, model_filename) 63 | state_dict = torch.load(cached_path, map_location=torch.device("cpu")) 64 | self.mlp.load_state_dict(state_dict) 65 | self.dtype = dtype 66 | self.eval() 67 | 68 | def __call__(self, images): 69 | device = next(self.parameters()).device 70 | images = torchvision.transforms.Resize(self.target_size)(images) 71 | images = self.normalize(images).to(self.dtype).to(device) 72 | embed = self.clip.get_image_features(pixel_values=images) 73 | # normalize embedding 74 | embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) 75 | reward = self.mlp(embed).squeeze(1) 76 | return reward 77 | 78 | 79 | def aesthetic_scorer(hub_model_id, model_filename): 80 | scorer = AestheticScorer( 81 | model_id=hub_model_id, 82 | model_filename=model_filename, 83 | dtype=torch.float32, 84 | ) 85 | if is_npu_available(): 86 | scorer = scorer.npu() 87 | elif is_xpu_available(): 88 | scorer = scorer.xpu() 89 | else: 90 | scorer = scorer.cuda() 91 | 92 | def _fn(images, prompts, metadata): 93 | images = (images).clamp(0, 1) 94 | scores = scorer(images) 95 | return scores, {} 96 | 97 | return _fn 98 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-39health-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | import sys 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | from utils.general_policy import GClean 11 | 12 | _TEXT_LONG_REQUIRED_ = 10 13 | cleaner = GClean(_TEXT_LONG_REQUIRED_) 14 | 15 | def make_clean(args): 16 | global_file_no = 0 17 | global_id_no = 0 18 | 19 | jsonlfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(jsonlfiles),total=len(jsonlfiles)): 21 | 22 | dest_file = os.path.join(args.dest_path,"part-39-{:06d}.jsonl".format(global_file_no)) 23 | if os.path.exists(dest_file): os.remove(dest_file) 24 | global_file_no += 1 25 | of = open(dest_file,'w',encoding='utf-8') 26 | 27 | input_file = os.path.join(args.source_path,subfile) 28 | print("input_file:",input_file) 29 | with open(input_file, 'r',encoding='utf-8') as fin: 30 | for line in tqdm(fin): 31 | js_ = json.loads(line) 32 | ''' 33 | {"question": "唐氏筛查afp值结果是0.81----(女24岁)", "answer": "你好,唐氏筛查如果mom值偏高的话,有可能胎儿不正常。建议您进一步做无创DNA的检查。这个是相对比较准确的。唐氏筛查跟很多因素有关系,比如您填写的数值身高体,体重,末次月经。大部分怀孕的胎儿是正常的。怀孕期间每一次的检查都是排除胎儿畸形的。"} 34 | ''' 35 | js_dict = {} 36 | js_dict["id"] = global_id_no 37 | js_dict["source"] = "cn-medical-treatment" 38 | js_dict["subset"] = "39-health" 39 | js_dict["source_id"] = "" 40 | global_id_no += 1 41 | 42 | ques = js_["question"].strip() 43 | if ques[-1] not in ['。','!','?',"?",",",","]: 44 | ques = ques + "?" 45 | else: 46 | ques = ques[0:-1] + "?" 
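# Clean the answer half of the QA pair next: strip whitespace, drop any leading punctuation via GClean.clean_punct_at_begin, then join the normalized question and answer into a single "content" field.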
47 | answ = js_["answer"].strip() 48 | answ = cleaned_content = cleaner.clean_punct_at_begin(answ) 49 | js_dict["content"] = ques + answ 50 | 51 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 52 | if of.tell() > args.max_size: 53 | of.close() 54 | dest_file = os.path.join(args.dest_path,"part-39-{:06d}.jsonl".format(global_file_no)) 55 | if os.path.exists(dest_file): os.remove(dest_file) 56 | of = open(dest_file,'w',encoding='utf-8') 57 | global_file_no += 1 58 | of.close() 59 | 60 | def parse_args(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--source_path', 63 | type=str, 64 | default="/data/data_warehouse/SourceData/39_health", 65 | help='Directory containing trained actor model') 66 | parser.add_argument('--dest_path', 67 | type=str, 68 | default="/localdisk/llm/source_data/cn-39-health", 69 | help='Directory containing trained actor model') 70 | parser.add_argument('--dataset_name', 71 | type=str, 72 | default="cn-cn-39-health", 73 | help="") 74 | parser.add_argument('--max_size', 75 | type=int, 76 | default=200 * 1024 * 1024, 77 | help="max chunk size") 78 | args = parser.parse_args() 79 | return args 80 | 81 | if __name__ == "__main__": 82 | args = parse_args() 83 | 84 | if not os.path.exists(args.dest_path): 85 | os.makedirs(args.dest_path, exist_ok=True) 86 | make_clean(args) 87 | 88 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/orpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class ORPOConfig(TrainingArguments): 22 | r""" 23 | ORPOConfig collects all training arguments related to the [`ORPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_completion_length (`int`, defaults to `None`): 35 | The maximum length of the completions. This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in ORPO loss (lambda/alpha in paper/code) that is the weight of the relative loss ratio in the SFT loss. 38 | label_pad_token_id (`int`, defaults to `-100`): 39 | The label pad token id. 
This argument is required if you want to use the default data collator. 40 | padding_value (`int`, defaults to `None`): 41 | The padding value if it is different to the tokenizer's pad_token_id. 42 | truncation_mode (`str`, defaults to `keep_end`): 43 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 44 | generate_during_eval (`bool`, defaults to `False`): 45 | Whether to sample and log generations during evaluation step. 46 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 47 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 48 | disable_dropout (`bool`, defaults to `True`): 49 | Whether or not to disable dropouts in `model`. 50 | model_init_kwargs (`Optional[Dict]`, *optional*): 51 | Dict of Optional kwargs to pass when instantiating the model from a string 52 | dataset_num_proc (`Optional[int]`, *optional*): 53 | The number of workers to use to tokenize the data. Defaults to None. 54 | """ 55 | 56 | max_length: Optional[int] = None 57 | max_prompt_length: Optional[int] = None 58 | max_completion_length: Optional[int] = None 59 | 60 | beta: float = 0.1 61 | disable_dropout: bool = True 62 | 63 | label_pad_token_id: int = -100 64 | padding_value: int = None 65 | truncation_mode: str = "keep_end" 66 | generate_during_eval: bool = False 67 | is_encoder_decoder: Optional[bool] = None 68 | 69 | model_init_kwargs: Optional[Dict] = None 70 | 71 | dataset_num_proc: Optional[int] = None 72 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_ddpo_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 metric-space, The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import gc 15 | import unittest 16 | 17 | import torch 18 | 19 | from trl import is_diffusers_available 20 | 21 | from .testing_utils import require_diffusers 22 | 23 | 24 | if is_diffusers_available(): 25 | from trl import DDPOConfig, DDPOTrainer, DefaultDDPOStableDiffusionPipeline 26 | 27 | 28 | def scorer_function(images, prompts, metadata): 29 | return torch.randn(1) * 3.0, {} 30 | 31 | 32 | def prompt_function(): 33 | return ("cabbages", {}) 34 | 35 | 36 | @require_diffusers 37 | class DDPOTrainerTester(unittest.TestCase): 38 | """ 39 | Test the DDPOTrainer class. 
40 | """ 41 | 42 | def setUp(self): 43 | self.ddpo_config = DDPOConfig( 44 | num_epochs=2, 45 | train_gradient_accumulation_steps=1, 46 | per_prompt_stat_tracking_buffer_size=32, 47 | sample_num_batches_per_epoch=2, 48 | sample_batch_size=2, 49 | mixed_precision=None, 50 | save_freq=1000000, 51 | ) 52 | pretrained_model = "hf-internal-testing/tiny-stable-diffusion-torch" 53 | pretrained_revision = "main" 54 | 55 | pipeline = DefaultDDPOStableDiffusionPipeline( 56 | pretrained_model, pretrained_model_revision=pretrained_revision, use_lora=False 57 | ) 58 | 59 | self.trainer = DDPOTrainer(self.ddpo_config, scorer_function, prompt_function, pipeline) 60 | 61 | return super().setUp() 62 | 63 | def tearDown(self) -> None: 64 | gc.collect() 65 | 66 | def test_loss(self): 67 | advantage = torch.tensor([-1.0]) 68 | clip_range = 0.0001 69 | ratio = torch.tensor([1.0]) 70 | loss = self.trainer.loss(advantage, clip_range, ratio) 71 | self.assertEqual(loss.item(), 1.0) 72 | 73 | def test_generate_samples(self): 74 | samples, output_pairs = self.trainer._generate_samples(1, 2) 75 | self.assertEqual(len(samples), 1) 76 | self.assertEqual(len(output_pairs), 1) 77 | self.assertEqual(len(output_pairs[0][0]), 2) 78 | 79 | def test_calculate_loss(self): 80 | samples, _ = self.trainer._generate_samples(1, 2) 81 | sample = samples[0] 82 | 83 | latents = sample["latents"][0, 0].unsqueeze(0) 84 | next_latents = sample["next_latents"][0, 0].unsqueeze(0) 85 | log_probs = sample["log_probs"][0, 0].unsqueeze(0) 86 | timesteps = sample["timesteps"][0, 0].unsqueeze(0) 87 | prompt_embeds = sample["prompt_embeds"] 88 | advantage = torch.tensor([1.0], device=prompt_embeds.device) 89 | 90 | self.assertEqual(latents.shape, (1, 4, 64, 64)) 91 | self.assertEqual(next_latents.shape, (1, 4, 64, 64)) 92 | self.assertEqual(log_probs.shape, (1,)) 93 | self.assertEqual(timesteps.shape, (1,)) 94 | self.assertEqual(prompt_embeds.shape, (2, 77, 32)) 95 | loss, approx_kl, clipfrac = self.trainer.calculate_loss( 96 | latents, timesteps, next_latents, log_probs, advantage, prompt_embeds 97 | ) 98 | 99 | self.assertTrue(torch.isfinite(loss.cpu())) 100 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-sina_iask-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | import sys 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | from utils.general_policy import GClean 11 | 12 | _TEXT_LONG_REQUIRED_ = 10 13 | cleaner = GClean(_TEXT_LONG_REQUIRED_) 14 | 15 | def make_clean(args): 16 | global_file_no = 0 17 | global_id_no = 0 18 | 19 | jsonlfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(jsonlfiles),total=len(jsonlfiles)): 21 | 22 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 23 | if os.path.exists(dest_file): os.remove(dest_file) 24 | global_file_no += 1 25 | of = open(dest_file,'w',encoding='utf-8') 26 | 27 | input_file = os.path.join(args.source_path,subfile) 28 | print("input_file:",input_file) 29 | with open(input_file, 'r',encoding='utf-8') as fin: 30 | for line in tqdm(fin): 31 | js_ = json.loads(line) 32 | ''' 33 | {"question": "康宝xdr53-tvc1消毒柜使用方法", "answers": 
"、使用前认真检查设备运转是否正常,调节器和显示器是否“失控”。2、把洗净、抹净余水的餐具、茶具、食具按平行排列方式倒放或斜放于柜内架层上。3、关好柜门接通电源,扭动起动键。4、扭动“起动”键后,石英管开始发亮,表示消毒工作开始,消 毒结束后,自动切断电源,15分钟后才能打开门取用餐具。", "category": "生活"} 34 | {"question": "临期的香水可以买吗", "answers": "最好不要买吧,因为香水这种东西还挺耐用的,不可能快速就用完,有可能过期了也只用了一点点,小毫升的可以买,因为很快消耗掉,所以没关系,特别大的一瓶就没必要买了,买香水最好提前试香,选最喜欢的买,避开不喜欢的味道,没必要追求便宜去买的", "category": "生活"} 35 | ''' 36 | js_dict = {} 37 | js_dict["id"] = global_id_no 38 | js_dict["source"] = "cn-sina-iask" 39 | js_dict["subset"] = js_["category"].strip() 40 | js_dict["source_id"] = "" 41 | global_id_no += 1 42 | 43 | ques = js_["question"].strip() 44 | if ques[-1] not in ['。','!','?',"?",",",","]: 45 | ques = ques + "?" 46 | else: 47 | ques = ques[0:-1] + "?" 48 | answ = js_["answers"].strip() 49 | answ = cleaned_content = cleaner.clean_punct_at_begin(answ) 50 | js_dict["content"] = ques + answ 51 | 52 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 53 | if of.tell() > args.max_size: 54 | of.close() 55 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 56 | if os.path.exists(dest_file): os.remove(dest_file) 57 | of = open(dest_file,'w',encoding='utf-8') 58 | global_file_no += 1 59 | of.close() 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--source_path', 64 | type=str, 65 | default="/data/data_warehouse/SourceData/sina_iask", 66 | help='Directory containing trained actor model') 67 | parser.add_argument('--dest_path', 68 | type=str, 69 | default="/localdisk/llm/source_data/cn-sina-iask", 70 | help='Directory containing trained actor model') 71 | parser.add_argument('--dataset_name', 72 | type=str, 73 | default="cn-sina-iask", 74 | help="") 75 | parser.add_argument('--max_size', 76 | type=int, 77 | default=200 * 1024 * 1024, 78 | help="max chunk size") 79 | args = parser.parse_args() 80 | return args 81 | 82 | if __name__ == "__main__": 83 | args = parse_args() 84 | 85 | if not os.path.exists(args.dest_path): 86 | os.makedirs(args.dest_path, exist_ok=True) 87 | make_clean(args) 88 | 89 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/mnbvc_prepare-checkpoint.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import numpy as np 4 | import json 5 | import random 6 | import os 7 | import hashlib 8 | 9 | 10 | ''' 11 | {'fldStatus': 2, 'fldColumnID': 4, 'fldSubject': '蓝点Linux半年上市', 'fldContent': '','fldCreateTime': '2000-04-20 18:03:59', 'fldColumnName': '中国.com', 'fldUserID': 'liuren', 'fldName': '刘韧', 'fldView': 472, 'fldTypeID': '原创-IT', 'fldArticleID': 4, 'fldUserNum': 2} 12 | ''' 13 | def json2jsonl(): 14 | with open("./donews.18402.json","r",encoding='utf-8') as fo: data = json.load(fo) 15 | 16 | sft_data = [] 17 | idx = 0 18 | for idx,item in enumerate(data): 19 | js_dict = {} 20 | js_dict["id"] = idx + 1 21 | js_dict["source"] = "donews" 22 | js_dict["subset"] = item["fldSubject"] 23 | js_dict["source_id"] = "" 24 | js_dict["fldCreateTime"] = item["fldCreateTime"] 25 | js_dict["fldTypeID"] = item["fldTypeID"] 26 | js_dict["content"] = item["fldContent"] 27 | sft_data.append(js_dict) 28 | 29 | dest_file = os.path.join("./","donews.18402.jsonl") 30 | if os.path.exists(dest_file): os.remove(dest_file) 31 | of = open(dest_file,'w',encoding='utf-8') 32 | 33 | #random.shuffle(sft_data) 34 | for item in sft_data: 35 | print(json.dumps(item,ensure_ascii=False),file=of) 36 | of.close() 37 | print(f"writting {len(sft_data)} 
lines into {dest_file}") 38 | 39 | def data2mnbvc_style(input_file,output_dir): 40 | ''' 41 | { 42 | '文件名': '文件.txt', 43 | '是否待查文件': False, 44 | '是否重复文件': False, 45 | '文件大小': 1024, 46 | 'simhash': 0, 47 | '最长段落长度': 0, 48 | '段落数': 0, 49 | '去重段落数': 0, 50 | '低质量段落数': 0, 51 | '段落': [ 52 | { 53 | '行号': 1, 54 | '是否重复': False, 55 | '是否跨文件重复': False, 56 | 'md5': 'md5hash1', 57 | '内容': '这是第一段文字。' 58 | } 59 | ] 60 | } 61 | ''' 62 | global_file_no = 0 63 | dest_file = os.path.join(output_dir,"mnbvc-donews-part-{:06d}.jsonl".format(global_file_no)) 64 | if os.path.exists(dest_file): os.remove(dest_file) 65 | of = open(dest_file,'w',encoding='utf-8') 66 | 67 | for line in open(input_file,"r",encoding='utf-8'): 68 | line = line.strip() 69 | if len(line) < 5: continue 70 | js_dict = json.loads(line) 71 | 72 | js_new = {} 73 | js_new['文件名'] = js_dict["source"] 74 | js_new['是否待查文件'] = False 75 | js_new['是否重复文件'] = False 76 | js_new['文件大小'] = len(js_dict["content"]) 77 | js_new['simhash'] = '' 78 | js_new['最长段落长度'] = len(js_dict["content"]) 79 | js_new['段落数'] = 1 80 | js_new['去重段落数'] = 0 81 | js_new['低质量段落数'] = 0 82 | js_new['段落'] = [] 83 | 84 | item = {} 85 | item['行号'] = 1 86 | item['是否重复'] = False 87 | item['是否跨文件重复'] = False 88 | 89 | content = js_dict["content"] 90 | md5 = hashlib.md5(content.encode('utf-8')).hexdigest() 91 | item['md5'] = md5 92 | item['内容'] = js_dict["content"] 93 | js_new['段落'].append(item) 94 | 95 | print(json.dumps(js_new,ensure_ascii=False),file=of) 96 | if of.tell() > 20 * 1024 * 1024: 97 | of.close() 98 | dest_file = os.path.join(output_dir,"mnbvc-donews-part-{:06d}.jsonl".format(global_file_no)) 99 | if os.path.exists(dest_file): os.remove(dest_file) 100 | of = open(dest_file,'w',encoding='utf-8') 101 | global_file_no += 1 102 | 103 | of.close() 104 | 105 | 106 | if __name__ == "__main__": 107 | input_file = "../llm/clean_data/cn-donews/v1/good/donews.18402.jsonl" 108 | output_file = "../llm/clean_data/cn-donews/v1/good/" 109 | data2mnbvc_style(input_file,output_file) 110 | 111 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/sft_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class SFTConfig(TrainingArguments): 22 | r""" 23 | Initialize SFTConfig. 24 | 25 | Args: 26 | dataset_text_field (`Optional[str]`): 27 | The name of the text field of the dataset, in case this is passed by a user, the trainer will automatically create a 28 | `ConstantLengthDataset` based on the `dataset_text_field` argument. Defaults to None. 29 | packing (`Optional[bool]`): 30 | Used only in case `dataset_text_field` is passed. 
This argument is used by the `ConstantLengthDataset` to pack the sequences 31 | of the dataset. Defaults to False. 32 | max_seq_length (`Optional[int]`): 33 | The maximum sequence length to use for the `ConstantLengthDataset` and for automatically creating the Dataset. Defaults to min of the smaller of the `tokenizer.model_max_length` and `1024`. 34 | dataset_num_proc (`Optional[int]`): 35 | The number of workers to use to tokenize the data. Only used when `packing=False`. Defaults to None. 36 | dataset_batch_size (`int`): 37 | The number of examples to tokenize per batch. If batch_size <= 0 or batch_size == None, 38 | tokenize the full dataset as a single batch. Defaults to 1000. 39 | neftune_noise_alpha (`Optional[float]`): 40 | If not `None`, this will activate NEFTune noise embeddings. This has been proven to drastically improve model performances for instruction 41 | fine-tuning. Check out the original paper here: https://huggingface.co/papers/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune 42 | model_init_kwargs: (`Optional[Dict]`, *optional*): 43 | Dict of Optional kwargs to pass when instantiating the model from a string. 44 | dataset_kwargs: (`Optional[Dict]`, *optional*): 45 | Dict of Optional kwargs to pass when creating packed or non-packed datasets 46 | eval_packing: (`Optional[bool]`, *optional*): 47 | Whether to pack the eval dataset as well. Defaults to `packing` if `None` is passed. 48 | num_of_sequences (`Optional[int]`): 49 | The number of sequences to use for the `ConstantLengthDataset`. Defaults to `1024`. 50 | chars_per_token (`Optional[float]`): 51 | The number of characters per token to use for the `ConstantLengthDataset`. Defaults to `3.6`. You can check how this is computed in the 52 | stack-llama example: 53 | [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53). 
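        Example (a minimal illustrative sketch, assuming `SFTConfig` is importable from the top-level `trl` package as in upstream trl; the output_dir and field values are placeholders):

            from trl import SFTConfig
            config = SFTConfig(output_dir="./sft-out", dataset_text_field="text", packing=True, max_seq_length=1024)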
54 | """ 55 | 56 | dataset_text_field: Optional[str] = None 57 | packing: Optional[bool] = False 58 | max_seq_length: Optional[int] = None 59 | dataset_num_proc: Optional[int] = None 60 | dataset_batch_size: int = 1000 61 | neftune_noise_alpha: Optional[float] = None 62 | model_init_kwargs: Optional[Dict] = None 63 | dataset_kwargs: Optional[Dict] = None 64 | eval_packing: Optional[bool] = None 65 | num_of_sequences: Optional[int] = 1024 66 | chars_per_token: Optional[float] = 3.6 67 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/dataset_formatting.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable, Literal, Optional, Union 3 | 4 | from datasets import Dataset, Value 5 | from transformers import AutoTokenizer 6 | 7 | from ..trainer.utils import ConstantLengthDataset 8 | 9 | 10 | FORMAT_MAPPING = { 11 | "chatml": [{"content": Value(dtype="string", id=None), "role": Value(dtype="string", id=None)}], 12 | "instruction": {"completion": Value(dtype="string", id=None), "prompt": Value(dtype="string", id=None)}, 13 | } 14 | 15 | 16 | def conversations_formatting_function(tokenizer: AutoTokenizer, messages_field: Literal["messages", "conversations"]): 17 | r""" 18 | return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the tokenizer 19 | apply chat template to the dataset 20 | """ 21 | 22 | def format_dataset(examples): 23 | if isinstance(examples[messages_field][0], list): 24 | output_texts = [] 25 | for i in range(len(examples[messages_field])): 26 | output_texts.append(tokenizer.apply_chat_template(examples[messages_field][i], tokenize=False)) 27 | return output_texts 28 | else: 29 | return tokenizer.apply_chat_template(examples[messages_field], tokenize=False) 30 | 31 | return format_dataset 32 | 33 | 34 | def instructions_formatting_function(tokenizer: AutoTokenizer): 35 | r""" 36 | return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the tokenizer 37 | apply chat template to the dataset 38 | """ 39 | 40 | def format_dataset(examples): 41 | if isinstance(examples["prompt"], list): 42 | output_texts = [] 43 | for i in range(len(examples["prompt"])): 44 | converted_sample = [ 45 | {"role": "user", "content": examples["prompt"][i]}, 46 | {"role": "assistant", "content": examples["completion"][i]}, 47 | ] 48 | output_texts.append(tokenizer.apply_chat_template(converted_sample, tokenize=False)) 49 | return output_texts 50 | else: 51 | converted_sample = [ 52 | {"role": "user", "content": examples["prompt"]}, 53 | {"role": "assistant", "content": examples["completion"]}, 54 | ] 55 | return tokenizer.apply_chat_template(converted_sample, tokenize=False) 56 | 57 | return format_dataset 58 | 59 | 60 | def get_formatting_func_from_dataset( 61 | dataset: Union[Dataset, ConstantLengthDataset], tokenizer: AutoTokenizer 62 | ) -> Optional[Callable]: 63 | r""" 64 | Finds the correct formatting function based on the dataset structure. 
Currently supported datasets are: 65 | - `ChatML` with [{"role": str, "content": str}] 66 | - `instruction` with [{"prompt": str, "completion": str}] 67 | 68 | Args: 69 | dataset (Dataset): User dataset 70 | tokenizer (AutoTokenizer): Tokenizer used for formatting 71 | 72 | Returns: 73 | Callable: Formatting function if the dataset format is supported else None 74 | """ 75 | if isinstance(dataset, Dataset): 76 | if "messages" in dataset.features: 77 | if dataset.features["messages"] == FORMAT_MAPPING["chatml"]: 78 | logging.info("Formatting dataset with chatml format") 79 | return conversations_formatting_function(tokenizer, "messages") 80 | if "conversations" in dataset.features: 81 | if dataset.features["conversations"] == FORMAT_MAPPING["chatml"]: 82 | logging.info("Formatting dataset with chatml format") 83 | return conversations_formatting_function(tokenizer, "conversations") 84 | elif dataset.features == FORMAT_MAPPING["instruction"]: 85 | logging.info("Formatting dataset with instruction format") 86 | return instructions_formatting_function(tokenizer) 87 | 88 | return None 89 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/model_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional 3 | 4 | from ..core import flatten_dict 5 | 6 | 7 | @dataclass 8 | class ModelConfig: 9 | """ 10 | Arguments which define the model and tokenizer to load. 11 | """ 12 | 13 | model_name_or_path: Optional[str] = field( 14 | default=None, 15 | metadata={"help": ("The model checkpoint for weights initialization.")}, 16 | ) 17 | model_revision: str = field( 18 | default="main", 19 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 20 | ) 21 | torch_dtype: Optional[str] = field( 22 | default=None, 23 | metadata={ 24 | "help": ( 25 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 26 | "dtype will be automatically derived from the model's weights." 
27 | ), 28 | "choices": ["auto", "bfloat16", "float16", "float32"], 29 | }, 30 | ) 31 | trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) 32 | attn_implementation: Optional[str] = field( 33 | default=None, 34 | metadata={ 35 | "help": ( 36 | "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" 37 | ) 38 | }, 39 | ) 40 | use_peft: bool = field( 41 | default=False, 42 | metadata={"help": ("Whether to use PEFT or not for training.")}, 43 | ) 44 | lora_r: Optional[int] = field( 45 | default=16, 46 | metadata={"help": ("LoRA R value.")}, 47 | ) 48 | lora_alpha: Optional[int] = field( 49 | default=32, 50 | metadata={"help": ("LoRA alpha.")}, 51 | ) 52 | lora_dropout: Optional[float] = field( 53 | default=0.05, 54 | metadata={"help": ("LoRA dropout.")}, 55 | ) 56 | lora_target_modules: Optional[List[str]] = field( 57 | default=None, 58 | metadata={"help": ("LoRA target modules.")}, 59 | ) 60 | lora_modules_to_save: Optional[List[str]] = field( 61 | default=None, 62 | metadata={"help": ("Model layers to unfreeze & train")}, 63 | ) 64 | lora_task_type: str = field( 65 | default="CAUSAL_LM", metadata={"help": "The task_type to pass for LoRA (use SEQ_CLS for reward modeling)"} 66 | ) 67 | use_rslora: bool = field( 68 | default=False, 69 | metadata={ 70 | "help": ( 71 | "Use Rank-Stabilized LoRA (https://huggingface.co/papers/2312.03732), which sets the adapter " 72 | "scaling factor to lora_alpha/√r, instead of the original default value of `lora_alpha/r`." 73 | ) 74 | }, 75 | ) 76 | load_in_8bit: bool = field( 77 | default=False, metadata={"help": "use 8 bit precision for the base model - works only with LoRA"} 78 | ) 79 | load_in_4bit: bool = field( 80 | default=False, metadata={"help": "use 4 bit precision for the base model - works only with LoRA"} 81 | ) 82 | 83 | bnb_4bit_quant_type: Optional[str] = field( 84 | default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"} 85 | ) 86 | use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) 87 | 88 | def to_dict(self): 89 | output_dict = {} 90 | for key, value in self.__dict__.items(): 91 | output_dict[key] = value 92 | return flatten_dict(output_dict) 93 | 94 | def __post_init__(self): 95 | if self.load_in_8bit and self.load_in_4bit: 96 | raise ValueError("You can't use 8 bit and 4 bit precision at the same time") 97 | 98 | if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1: 99 | self.lora_target_modules = self.lora_target_modules[0] 100 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/tools/calculator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import re 17 | 18 | import numpy as np 19 | import torch 20 | from transformers import AutoTokenizer, load_tool 21 | 22 | from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, TextEnvironment 23 | 24 | 25 | def generate_data(n): 26 | """Generate random arithmetic tasks and answers.""" 27 | tasks, answers = [], [] 28 | for _ in range(n): 29 | a = np.random.randint(0, 50) 30 | b = np.random.randint(0, 50) 31 | op = np.random.choice(["-", "+", "*"]) 32 | tasks.append(f"\n\nWhat is {a} {op} {b}?") 33 | if op == "-": 34 | answers.append(a - b) 35 | elif op == "+": 36 | answers.append(a + b) 37 | else: 38 | answers.append(a * b) 39 | return tasks, answers 40 | 41 | 42 | def exact_match_reward(responses, answers=None): 43 | """Reward if generated response contains correct answer.""" 44 | rewards = [] 45 | pattern = r"Result\s*=\s*(-?\d+(?:\.\d+)?)\s*" # generated by chatGPT 46 | for response, answer in zip(responses, answers): 47 | reward = 0.0 48 | predicted_number = None 49 | match_pattern = re.findall(pattern, response) 50 | if match_pattern: 51 | predicted_number = float(match_pattern[0]) 52 | if predicted_number is not None: 53 | if np.abs(predicted_number - answer) < 0.01: 54 | reward += 1.0 55 | rewards.append(torch.tensor(reward)) 56 | return rewards 57 | 58 | 59 | # set up models 60 | model_id = "gpt2" 61 | model = AutoModelForCausalLMWithValueHead.from_pretrained(model_id) 62 | model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_id) 63 | tokenizer = AutoTokenizer.from_pretrained(model_id) 64 | tokenizer.pad_token = tokenizer.eos_token 65 | 66 | # system prompt 67 | prompt = """\ 68 | What is 13-3? 69 | 70 | 13-310.0 71 | 72 | Result=10 73 | 74 | What is 4*3? 
75 | 76 | 4*312.0 77 | 78 | Result=12""" 79 | 80 | generation_kwargs = { 81 | "min_length": -1, 82 | "top_k": 0.0, 83 | "top_p": 1.0, 84 | "do_sample": True, 85 | "pad_token_id": tokenizer.eos_token_id, 86 | "eos_token_id": -1, 87 | "max_new_tokens": 32, 88 | } 89 | 90 | # trainer 91 | ppo_config = PPOConfig( 92 | batch_size=256, 93 | learning_rate=1.41e-5, 94 | mini_batch_size=64, 95 | log_with="wandb", 96 | ) 97 | ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer) 98 | 99 | # text env 100 | text_env = TextEnvironment( 101 | model, 102 | tokenizer, 103 | {"SimpleCalculatorTool": load_tool("ybelkada/simple-calculator")}, 104 | exact_match_reward, 105 | prompt, 106 | generation_kwargs=generation_kwargs, 107 | ) 108 | 109 | # main training loop 110 | for step in range(100): 111 | tasks, answers = generate_data(ppo_config.batch_size) 112 | queries, responses, masks, rewards, histories = text_env.run(tasks, answers=answers) 113 | train_stats = ppo_trainer.step(queries, responses, rewards, masks) 114 | 115 | response_texts = [tokenizer.decode(response) for response in responses] 116 | query_texts = [tokenizer.decode(query) for query in queries] 117 | texts = {"query": [qt.split("")[-1].strip() for qt in query_texts], "response": response_texts} 118 | ppo_trainer.log_stats(train_stats, texts, rewards, columns_to_log=["query", "response", "answer"]) 119 | ppo_trainer.save_pretrained(model_id + "-calculator") 120 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-mnbvc-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def make_clean(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | subsets = sorted(listdir(args.source_path)) 14 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 15 | 16 | if subset_dir not in ["gov","law","news","qa"]: continue 17 | 18 | file_dir = os.path.join(args.source_path,subset_dir) 19 | 20 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 21 | if os.path.exists(dest_file): os.remove(dest_file) 22 | global_file_no += 1 23 | of = open(dest_file,'w',encoding='utf-8') 24 | 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | for file in files: 29 | if not file.endswith(".jsonl.gz"):continue 30 | input_file = os.path.join(root,file) 31 | print("input_file:",input_file) 32 | with gzip.open(input_file, 'rt') as f: 33 | for line in f: 34 | js_ = json.loads(line) 35 | 36 | js_dict = {} 37 | js_dict["id"] = global_id_no 38 | js_dict["source"] = "cn-mnbvc" 39 | js_dict["subset"] = subset_dir 40 | js_dict["source_id"] = file 41 | global_id_no += 1 42 | 43 | if subset_dir in ["gov"]: 44 | if "文件名" in js_: 45 | js_dict["source_id"] = js_["文件名"] 46 | js_dict["content"] = '\n'.join([item["内容"] for item in js_["段落"]]) 47 | else: 48 | js_dict["source_id"] = eval(js_["meta"])["文件名"] 49 | js_dict["content"] = js_["text"] 50 | elif subset_dir in ["law"]: 51 | js_dict["source_id"] = js_["分卷名"] 52 | js_dict["content"] = js_["详情"] 53 | elif subset_dir in ["news"]: 54 | js_dict["source_id"] = os.path.basename(js_["文件名"]) 55 | js_dict["content"] = '\n'.join([item["内容"] for item in js_["段落"]]) 56 | elif subset_dir in ["qa"]: 57 | js_dict["source_id"] = js_["来源"] 
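# QA subset: the source id is taken from the 来源 (source) field; the next line joins 问 (question) and 答 (answer) with a newline to form the content.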
58 | js_dict["content"] = js_["问"]+"\n"+js_["答"] 59 | 60 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 61 | if of.tell() > args.max_size: 62 | of.close() 63 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 64 | if os.path.exists(dest_file): os.remove(dest_file) 65 | of = open(dest_file,'w',encoding='utf-8') 66 | global_file_no += 1 67 | of.close() 68 | 69 | 70 | def parse_args(): 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--source_path', 73 | type=str, 74 | default="/data/data_warehouse/llm/source_data/cn-mnbvc", 75 | help='Directory containing trained actor model') 76 | parser.add_argument('--dest_path', 77 | type=str, 78 | default="/data/data_warehouse/llm/source_data/cn-mnbvc2", 79 | help='Directory containing trained actor model') 80 | parser.add_argument('--dataset_name', 81 | type=str, 82 | default="cn-mnbvc", 83 | help="") 84 | parser.add_argument('--max_size', 85 | type=int, 86 | default=200 * 1024 * 1024, 87 | help="max chunk size") 88 | args = parser.parse_args() 89 | return args 90 | 91 | if __name__ == "__main__": 92 | args = parse_args() 93 | 94 | if not os.path.exists(args.dest_path): 95 | os.makedirs(args.dest_path, exist_ok=True) 96 | make_clean(args) 97 | 98 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_data_collator_completion_only.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import unittest 15 | 16 | import torch 17 | from transformers import AutoTokenizer 18 | 19 | from trl import DataCollatorForCompletionOnlyLM 20 | 21 | 22 | class DataCollatorForCompletionOnlyLMTester(unittest.TestCase): 23 | def test_data_collator_finds_response_template_llama2_tokenizer(self): 24 | # this should ideally be tested with meta-llama/Llama-2-7b-hf 25 | self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab") 26 | self.instruction = """### System: You are a helpful assistant. 27 | 28 | ### User: How much is 2+2? 
29 | 30 | ### Assistant: 2+2 equals 4""" 31 | self.instruction_template = "\n### User:" 32 | self.response_template = "\n### Assistant:" 33 | 34 | # GPT2Tokenizer: [198, 21017, 11787, 25] -> [11787, 25] 35 | # Llama2Tokenizer: [29871, 13, 2277, 29937, 4911, 29901] -> [2277, 29937, 4911, 29901] 36 | self.tokenized_instruction_w_context = self.tokenizer.encode( 37 | self.instruction_template, add_special_tokens=False 38 | )[2:] 39 | 40 | # GPT2Tokenizer: [198, 21017, 15286, 25] -> [15286, 25] 41 | # Llama2Tokenizer: [29871, 13, 2277, 29937, 4007, 22137, 29901] -> [2277, 29937, 4007, 22137, 29901] 42 | self.tokenized_response_w_context = self.tokenizer.encode(self.response_template, add_special_tokens=False)[2:] 43 | 44 | # Plain check on string 45 | self.assertIn(self.response_template, self.instruction) 46 | self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False) 47 | 48 | # Test the fix for #598 49 | # Pass already tokenized (w context) and truncated response_template so token_ids are like in the instruction + response 50 | self.collator = DataCollatorForCompletionOnlyLM(self.tokenized_response_w_context, tokenizer=self.tokenizer) 51 | self.collator.torch_call([self.tokenized_instruction]) 52 | 53 | # Test for PR #749 54 | # Pass already tokenized (w context) instruction and response both so token_ids are like in the instruction + response 55 | self.collator = DataCollatorForCompletionOnlyLM( 56 | self.tokenized_response_w_context, self.tokenized_instruction_w_context, tokenizer=self.tokenizer 57 | ) 58 | self.collator.torch_call([self.tokenized_instruction]) 59 | 60 | def test_data_collator_handling_of_long_sequences(self): 61 | self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab") 62 | self.instruction = """### System: You are a helpful assistant. 63 | 64 | ### User: How much is 2+2? I'm asking because I'm not sure. And I'm not sure because I'm not good at math. 65 | """ 66 | self.response_template = "\n### Assistant:" 67 | # check DataCollatorForCompletionOnlyLM using response template only 68 | self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False) 69 | self.collator = DataCollatorForCompletionOnlyLM(self.response_template, tokenizer=self.tokenizer) 70 | encoded_instance = self.collator.torch_call([self.tokenized_instruction]) 71 | result = torch.all(encoded_instance["labels"] == -100) 72 | self.assertTrue(result, "Not all values in the tensor are -100.") 73 | 74 | # check DataCollatorForCompletionOnlyLM using response template and instruction template 75 | self.instruction_template = "\n### User:" 76 | self.collator = DataCollatorForCompletionOnlyLM( 77 | self.response_template, self.instruction_template, tokenizer=self.tokenizer 78 | ) 79 | encoded_instance = self.collator.torch_call([self.tokenized_instruction]) 80 | result = torch.all(encoded_instance["labels"] == -100) 81 | self.assertTrue(result, "Not all values in the tensor are -100.") 82 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_iterative_sft_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import tempfile 15 | import unittest 16 | 17 | import torch 18 | from datasets import Dataset 19 | from parameterized import parameterized 20 | from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments 21 | 22 | from trl import IterativeSFTTrainer 23 | 24 | 25 | class IterativeTrainerTester(unittest.TestCase): 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.model_id = "trl-internal-testing/dummy-GPT2-correct-vocab" 29 | cls.model = AutoModelForCausalLM.from_pretrained(cls.model_id) 30 | cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_id) 31 | cls.tokenizer.pad_token = cls.tokenizer.eos_token 32 | 33 | # get t5 as seq2seq example: 34 | model_id = "trl-internal-testing/tiny-T5ForConditionalGeneration-correct-vocab" 35 | cls.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) 36 | cls.t5_tokenizer = AutoTokenizer.from_pretrained(model_id) 37 | 38 | def _init_tensor_dummy_dataset(self): 39 | dummy_dataset_dict = { 40 | "input_ids": [torch.tensor([5303, 3621]), torch.tensor([3666, 1438, 318]), torch.tensor([5303, 3621])], 41 | "attention_mask": [torch.tensor([1, 1]), torch.tensor([1, 1, 1]), torch.tensor([1, 1])], 42 | "labels": [torch.tensor([5303, 3621]), torch.tensor([3666, 1438, 318]), torch.tensor([5303, 3621])], 43 | } 44 | 45 | dummy_dataset = Dataset.from_dict(dummy_dataset_dict) 46 | dummy_dataset.set_format("torch") 47 | return dummy_dataset 48 | 49 | def _init_textual_dummy_dataset(self): 50 | dummy_dataset_dict = { 51 | "texts": ["Testing the IterativeSFTTrainer.", "This is a test of the IterativeSFTTrainer"], 52 | "texts_labels": ["Testing the IterativeSFTTrainer.", "This is a test of the IterativeSFTTrainer"], 53 | } 54 | 55 | dummy_dataset = Dataset.from_dict(dummy_dataset_dict) 56 | dummy_dataset.set_format("torch") 57 | return dummy_dataset 58 | 59 | def setUp(self): 60 | # initialize trainer 61 | self.model.train() 62 | return super().setUp() 63 | 64 | @parameterized.expand( 65 | [ 66 | ["gpt2", "tensor"], 67 | ["gpt2", "text"], 68 | ["t5", "tensor"], 69 | ["t5", "text"], 70 | ] 71 | ) 72 | def test_iterative_step_from_tensor(self, model_name, input_name): 73 | with tempfile.TemporaryDirectory() as tmp_dir: 74 | # initialize dataset 75 | if input_name == "tensor": 76 | dummy_dataset = self._init_tensor_dummy_dataset() 77 | inputs = { 78 | "input_ids": dummy_dataset["input_ids"], 79 | "attention_mask": dummy_dataset["attention_mask"], 80 | "labels": dummy_dataset["labels"], 81 | } 82 | else: 83 | dummy_dataset = self._init_textual_dummy_dataset() 84 | inputs = { 85 | "texts": dummy_dataset["texts"], 86 | "texts_labels": dummy_dataset["texts_labels"], 87 | } 88 | 89 | if model_name == "gpt2": 90 | model = self.model 91 | tokenizer = self.tokenizer 92 | else: 93 | model = self.t5_model 94 | tokenizer = self.t5_tokenizer 95 | 96 | args = TrainingArguments( 97 | output_dir=tmp_dir, 98 | per_device_train_batch_size=2, 99 | max_steps=2, 100 | ) 101 | iterative_trainer = IterativeSFTTrainer(model=model, args=args, tokenizer=tokenizer) 102 | 103 | 
iterative_trainer.step(**inputs) 104 | 105 | for param in iterative_trainer.model.parameters(): 106 | assert param.grad is not None 107 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/alignprop_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | from dataclasses import dataclass, field 5 | from typing import Literal, Optional 6 | 7 | from ..core import flatten_dict 8 | from ..import_utils import is_bitsandbytes_available, is_torchvision_available 9 | 10 | 11 | @dataclass 12 | class AlignPropConfig: 13 | """ 14 | Configuration class for AlignPropTrainer 15 | """ 16 | 17 | # common parameters 18 | exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] 19 | """the name of this experiment (by default is the file name without the extension name)""" 20 | run_name: Optional[str] = "" 21 | """Run name for wandb logging and checkpoint saving.""" 22 | seed: int = 0 23 | """Seed value for random generations""" 24 | log_with: Optional[Literal["wandb", "tensorboard"]] = None 25 | """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" 26 | log_image_freq = 1 27 | """Logging Frequency for images""" 28 | tracker_kwargs: dict = field(default_factory=dict) 29 | """Keyword arguments for the tracker (e.g. wandb_project)""" 30 | accelerator_kwargs: dict = field(default_factory=dict) 31 | """Keyword arguments for the accelerator""" 32 | project_kwargs: dict = field(default_factory=dict) 33 | """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" 34 | tracker_project_name: str = "trl" 35 | """Name of project to use for tracking""" 36 | logdir: str = "logs" 37 | """Top-level logging directory for checkpoint saving.""" 38 | 39 | # hyperparameters 40 | num_epochs: int = 100 41 | """Number of epochs to train.""" 42 | save_freq: int = 1 43 | """Number of epochs between saving model checkpoints.""" 44 | num_checkpoint_limit: int = 5 45 | """Number of checkpoints to keep before overwriting old ones.""" 46 | mixed_precision: str = "fp16" 47 | """Mixed precision training.""" 48 | allow_tf32: bool = True 49 | """Allow tf32 on Ampere GPUs.""" 50 | resume_from: Optional[str] = "" 51 | """Resume training from a checkpoint.""" 52 | sample_num_steps: int = 50 53 | """Number of sampler inference steps.""" 54 | sample_eta: float = 1.0 55 | """Eta parameter for the DDIM sampler.""" 56 | sample_guidance_scale: float = 5.0 57 | """Classifier-free guidance weight.""" 58 | train_batch_size: int = 1 59 | """Batch size (per GPU!) 
to use for training.""" 60 | train_use_8bit_adam: bool = False 61 | """Whether to use the 8bit Adam optimizer from bitsandbytes.""" 62 | train_learning_rate: float = 1e-3 63 | """Learning rate.""" 64 | train_adam_beta1: float = 0.9 65 | """Adam beta1.""" 66 | train_adam_beta2: float = 0.999 67 | """Adam beta2.""" 68 | train_adam_weight_decay: float = 1e-4 69 | """Adam weight decay.""" 70 | train_adam_epsilon: float = 1e-8 71 | """Adam epsilon.""" 72 | train_gradient_accumulation_steps: int = 1 73 | """Number of gradient accumulation steps.""" 74 | train_max_grad_norm: float = 1.0 75 | """Maximum gradient norm for gradient clipping.""" 76 | negative_prompts: Optional[str] = "" 77 | """Comma-separated list of prompts to use as negative examples.""" 78 | truncated_backprop_rand: bool = True 79 | """Truncated Randomized Backpropation randomizes truncation to different diffusion timesteps""" 80 | truncated_backprop_timestep: int = 49 81 | """Absolute timestep to which the gradients are being backpropagated. If truncated_backprop_rand is False""" 82 | truncated_rand_backprop_minmax: tuple = (0, 50) 83 | """Range of diffusion timesteps for randomized truncated backprop.""" 84 | 85 | def to_dict(self): 86 | output_dict = {} 87 | for key, value in self.__dict__.items(): 88 | output_dict[key] = value 89 | return flatten_dict(output_dict) 90 | 91 | def __post_init__(self): 92 | if self.log_with not in ["wandb", "tensorboard"]: 93 | warnings.warn( 94 | "Accelerator tracking only supports image logging if `log_with` is set to 'wandb' or 'tensorboard'." 95 | ) 96 | 97 | if self.log_with == "wandb" and not is_torchvision_available(): 98 | warnings.warn("Wandb image logging requires torchvision to be installed") 99 | 100 | if self.train_use_8bit_adam and not is_bitsandbytes_available(): 101 | raise ImportError( 102 | "You need to install bitsandbytes to use 8bit Adam. " 103 | "You can install it with `pip install bitsandbytes`." 104 | ) 105 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/cpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Literal, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class CPOConfig(TrainingArguments): 22 | r""" 23 | CPOConfig collects all training arguments related to the [`CPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 
32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_target_length (`int`, defaults to `None`): 35 | The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in CPO loss. 38 | label_smoothing (`float`, defaults to 0): 39 | The label smoothing factor. This argument is required if you want to use the default data collator. 40 | loss_type (`str`, defaults to `sigmoid`): 41 | The type of loss to use. This argument is required if you want to use the default data collator. 42 | label_pad_token_id (`int`, defaults to `-100`): 43 | The label pad token id. This argument is required if you want to use the default data collator. 44 | cpo_alpha (`float`, defaults to `1.0`): 45 | A hyperparameter that controls the strength of the BC regularizer in CPO training. 46 | simpo_gamma (`float`, defaults to `0.5`): 47 | A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. 48 | padding_value (`int`, defaults to `None`): 49 | The padding value if it is different to the tokenizer's pad_token_id. 50 | truncation_mode (`str`, defaults to `keep_end`): 51 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 52 | generate_during_eval (`bool`, defaults to `False`): 53 | Whether to sample and log generations during evaluation step. 54 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 55 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 56 | disable_dropout (`bool`, defaults to `True`): 57 | Whether or not to disable dropouts in `model`. 58 | model_init_kwargs (`Optional[Dict]`, *optional*): 59 | Dict of Optional kwargs to pass when instantiating the model from a string 60 | dataset_num_proc (`Optional[int]`, *optional*): 61 | The number of workers to use to tokenize the data. Defaults to None. 62 | """ 63 | 64 | max_length: Optional[int] = None 65 | max_prompt_length: Optional[int] = None 66 | max_completion_length: Optional[int] = None 67 | max_target_length: Optional[int] = None 68 | 69 | beta: float = 0.1 70 | label_smoothing: float = 0 71 | loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" 72 | disable_dropout: bool = True 73 | cpo_alpha: float = 1.0 74 | simpo_gamma: float = 0.5 75 | 76 | label_pad_token_id: int = -100 77 | padding_value: int = None 78 | truncation_mode: str = "keep_end" 79 | generate_during_eval: bool = False 80 | is_encoder_decoder: Optional[bool] = None 81 | 82 | model_init_kwargs: Optional[Dict] = None 83 | 84 | dataset_num_proc: Optional[int] = None 85 | 86 | def __post_init__(self): 87 | if self.loss_type == "kto_pair": 88 | raise ValueError("Support for kto_pair has been removed in CPOTrainer. 
Please use KTOTrainer.") 89 | return super().__post_init__() 90 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/test60.py: -------------------------------------------------------------------------------- 1 | from general_policy import GClean 2 | 3 | cleaner = GClean(50) 4 | 5 | sentence = '018骞存湯锛岃?琛屾牳 蹇冧竴绾祫鏈?厖瓒崇巼銆佷竴绾祫鏈?厖瓒崇巼鍙婅祫鏈?厖瓒崇巼鍒嗗埆涓?.54%锛?.39%鍙?1.50%锛屽潎婊冻鐩戠?杈炬爣瑕佹眰锛岃 緝涓婂勾鏈?.26銆?.21鍙?.30涓?櫨鍒嗙偣銆偂XIIa href=\"\" target=\"_blank\">聚彩网大发快3骗局:榛勬鼎涓?巻浠讳腑鍥介摱琛 屼笟鍗忎細鍏氬?濮斿憳銆佺?涔暱锛屽浗鏈夐噸鐐归噾铻嶆満鏋?浗杩涘嚭鍙i摱琛?鐩戜簨浼氬壇灞绾T笓鑱岀洃浜嬨佸贰瑙嗗憳绛夎亴 鍔備粬鎷湁鏀垮簻銆佺洃绠満鏋勩佽?涓氱浼氱粍缁囩瓑澶氫釜棰嗗煙鐨勫伐浣滅粡鍘嗭紝鐔熸倝閾惰?涓氱洃绠斂绛栵紝闀挎湡鍏虫敞 閲戣瀺绉戞妧棰嗗煙銆侞欢乐彩大发快3怎么下载鍘讳簡鍖婚櫌鍚庯紝鎴戝啀涔熸病鍥炲幓杩囬偅涓?瘯闀滅殑鎴块棿銆傜劧鍚庯紝鎴戝 氨鍥炲?浜嗐偂XII/span>。浣曠珛宄扮O锛屽湪鍒涘缓鍥介檯绉戝?腑蹇冭繖鏂归潰锛屽姏搴?杩涗竴姝姞澶c傚湪纭?鏂归潰锛岄?拰娣卞 湷闈犺繎鐨勬渤濂楀湴鍖猴紝澶T綋涓?.89骞虫柟鍏?鍔犲揩瑙勫垝锛屼富瑕佹槸娣卞湷鏂归潰閰嶅悎棣欐腐鏂归潰锛屽姞蹇?鍒掞紝鏃> 舵満鏉欢鎴愮啛浠悗鎺繘寤鸿?銆傜浠惰?鏂藉缓璁剧殑鍐嶄竴涓?柟闈一氨鏄?湪骞垮窞銆佹繁鍦冲埌棣欐腐锛屽箍宸炪佺彔娴峰埌婢抽 棬锛屽缓璁句袱鏉浗闄呮按骞崇殑绉戞妧鍒涙柊璧板粖锛岄櫎姝箣澶栵紝杩樿?闄一垱鍔為?垱鏂扮爺绌堕櫌锛岃繕鏈夊叾浠栨柟闈三殑 涓浜涙秹鍙婂埌绉戞妧鍒涙柊鏂归潰鐨勯噸澶T妇鎺?紝瑕佺户缁?鍔涙帹杩涳紝涓夊湴瀵嗗垏閰嶅悎锛屽皢浼氫骇鐢?+1+1杩滆繙澶T簬3 鐨勬晥鏋溿偂XII/span>鍦汉鎵嶆湇鍔柟闈?紝浣滀负鍥藉?棣栨壒娴峰?楂樺眰娆汉鎵嶅垱鏂板垱涓氬熀鍦帮紝缁忓紑鍖哄湪鍏浗棣栧垱 鈥滀笂绠佷笅绠皬鈥濆叏閾炬潯浜烘墠鏈嶅姟妯紡锛岀洰鍓嶅尯鍐呰仛闆嗗悇绫婚珮灞傛?浜烘墠鎬绘暟浣嶅眳骞垮窞甯傚悇鍖虹?涓銆 佸箍涓滅渷鍓嶅垪锛屽尯鍩熷紩鎵嶈仛鎵嶅憟鐜板浗闄呭寲銆佺郴缁熷寲鐨勬佸娍銆傛帴涓嬫潵锛岀粡寮鍖哄皢杩涗竴姝繁鍖栤滀笂绠 佷笅绠皬鈥濈殑鍏摼鏉汉鎵嶆湇鍔a寮忥紝鎵撻氫汉鎵嶅垱鏂板垱涓氥佸眳浣忋佺敓娲荤殑鈥滄渶鍚庝竴鍏?' 6 | 7 | sentence2 = '浴室玻璃隔断+固定杆","content":"看了好多小红书的浴室隔断都好好看就也想要一个无边框的,但是沟通出错了原来想要的是靠墙用卡扣的结果不知道怎么变成这种卡槽了,不过也还可以。横杆是玻璃厂老板来装的时候说我玻璃太大了不安全,卡槽返工的时候自己又拿了几种玻璃固定杆让我选非给我装的(不要钱),不然只固定墙上一面地上打胶就会很晃。这种和三角形固定的我还是选了这个,好歹还能挂点东西。装之前老板问了好几遍你家有没有小孩,要是有小孩子无边框的这个不可以。我这边没有小孩子,有小孩也不会用这个卫生间所以完全没问题。买的时候销售都不会说,不会说家里有小孩不要选无边框,玻璃大需要固定之类的。' 8 | 9 | sentence3 = '广州峰帆贸易有限公司 是一家集礼品策划、设计、开发、生产、销售、服务于一体的专业礼品企业。公司经营的产品包括:广州商务礼品、周年庆礼品、年会 礼品、节日礼品、促销礼品、广告宣传礼品、积分兑换礼品、员工福利礼品、特色礼品等系列产品。有礼品方面需求的广大客户,我们 将免费为您提供礼品策划,设计方案,以优质,全面的服务于广大客户, 欢迎您与我们联系!广州峰帆贸易有公司秉着\"务实进取\"的 企业宗旨,以诚信为本,开拓创新的精神,以追求卓越品质,提供优质服务的理念,通过现代化的管理打 造了优秀团队,为客户提供> 卓越的产品和优质的服务是我们不断追求的目标。我们以市场和客户的需求为导向,紧跟国际国内潮流,精选国 内外优质厂商达成战> 略联盟,成为专业礼品团购代理经销体系,为客户提供更多时尚、新奇、特色、实用并物美价优的礼品。上一篇: 广州迪欣贸易有限> 公司下一篇:湖南醴陵红官窑瓷业有限公司 成功签约广东省物流行业协会纯定制网站服务合作! 恭喜八爪鱼网络与省电信工程(成> 立于1950年)签约合作! 恭喜八爪鱼网络与美视晶莹(银幕行业世界前二)签约合作 恭喜八爪鱼网络与中标数据(证券代码:87070 8)签约合作! 恭喜八爪鱼网络与华南理工大学成功签约网站建设 做网站 建网站 小程序开发 网站制作 企业网站建设 广州网站建> 设 网站 有限公司 广州 成功案例 客户 广东 广州市 第一次 服务好 品牌 生物科技 事务所 家具 官网 律师 首饰 集团 核心 后台 代码 珠宝首饰 鑫诺 旅游观光 国际 服装纺织 公司 电气 设备 八爪鱼网络 美容化妆 装饰工程 空间设计 准备工作 前期 广告创意 钟表 美斯 鱼网 安防 装饰设计 有保障 网站设计 尼曼 餐饮管理 汽配 电子 吸引 工程 深圳市 教育培训 金融投资 酒店管理 生物> 医药 汽车 电力 物流运输 家居 电子电器 节能环保 金融 乐享 服装 汉光 医药 建材 机械设备 房地产业 食品 技术 旅行社 祖诺 > 餐饮 品牌策划 礼品盒 制造厂 很省心 优网站 精艺 玛雅 通讯 厦门 网页 彩印 智能 展览设计 建筑工程 设计公司 法律师 新闻 上 海互联网 企业管理 网络技术 艺术 携程 服饰 设计师 包满意 商城 值得过 浙江 银饰品 曼古 集团公司 红谷 伊顿 前端 格兰 幸福 西饼 互联网+ 百益 制衣厂 中山 大象 程序员 信得过 德马吉 力天 全案策划 朗昇 麦睿仕 四川 营销策划 广告设计 服务,服务 向 日葵 经理 知识产权 代表 中新 大陆 软银 实体商业 购物商城 手机 外贸 网络营销 企业 八鱼网 广告公司 阿里 华为 全球最具品> 牌价值百强 时间 趋势 捷达 优派 官窑 瓷业 企业邮箱 服务器 体育 空间租用 域名注册 平台 商家 日用品 皮具 超音速 电缆 消防 设备 德科 仪器 芬尼 东莞市 精密机械 实验设备 新材料 医疗 东津 尔曼 天使 珠宝 米莱 养堂 产业 科方 生物技术 鼎科 宝莎曼 首饰珠宝 数控设备 机具 可卡 成都 信息 恒爱 照明设备 轻工业 广东省 菲达 技工学校 中国留学生 我们能提供什么 夸克 家私 木 业 金融服务 兴隆 食品工业 研究所 陈记 投资管理 科创 极至 创意 凤凰 中国 企业顾问 第一次做网站、对网站不了解? 旧网站改 版、对网站有初步认识了? 我应该做个什么网站?您可以填写右边的表格,让我们了解您的项目需求,这是一个良好的开始,我们将> 会尽快与你取得联系。也欢迎您给我们打电话,让我们马上进行沟通吧!' 
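# sentence4 and sentence5 below are Chinese court rulings (a criminal ruling and an enforcement ruling);
# the script feeds them to cleaner.common_zhLessThan20 at the bottom, a check that, judging by its name,
# presumably flags text falling below a per-line Chinese-character count threshold.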
10 | 11 | sentence4 = '山东省菏泽市中级人民法 院刑 事 裁 定 书(2017)鲁17刑更967号罪犯袁德朋,男,汉族,1980年1月2日出生于山东省曹县。现在山东省菏泽监狱服刑。> 二〇一五年二月十日,本院作出(2014)菏刑一初字第11号刑事附带民事判决,以被告人袁德朋犯故意伤害罪,判处有期徒刑十五年 。宣判后,被告人不服提出上诉。二一五年六月十一日,山东省高级人民法院作出(2015)鲁刑三终字第44号刑事附带民事裁定,驳 回上诉,维持原判。宣判后交付执行。执行机关山东省菏泽监狱于2017年11月22日提出减刑建议书,报送本院审理。本院依法组成 合议庭进行了审理。本案现已审理终结。执行机关山东省菏泽监狱,以罪犯袁德朋在服刑期间能认罪悔罪;认真遵守法律法规及监> 规;接受教育改造;积极参加思想、文化、职业技术教育;积极参加劳动,努力完成劳动任务,确有悔改表现等为由,提出予以减 刑建议。并附罪犯袁德朋在服刑期间的表现、奖励记录等书证。经审理查明,罪犯袁德朋在山东省菏泽监狱服刑改造期间认罪悔罪 ;认真遵守法律法规及监规;接受教育改造;积极参加思想、文化、职业技术教育;积极参加劳动,努力完成劳动任务。曾获表扬 四次;被评为2016年度监区级罪犯改造积极分子。另查明,该犯在服刑期间主动履行民事赔偿20873.7元。上述事实,有罪犯奖励> 审批表、计分考核明细表、领款条等证据予以证实。本院认为,罪犯袁德朋在服刑期间确有悔改表现,符合减刑条件。并结合其犯 罪的性质、具体情节、社会危害程度、原判刑罚及财产性判项的履行情况及交付执行后的一贯表现等因素,依照《中华人民共和国 刑事诉讼法》第二百六十二条第二款,《中华人民共和国刑法》第七十九条、第七十八条之规定,裁定如下:对罪犯袁德朋减去有 期徒刑八个月的刑罚执行。(刑期自2013年7月30日起至2027年11月29日止)本裁定送达后即发生法律效力。' 12 | 13 | sentence5 = '吉林省蛟河市人民法院执 行 裁 定 书(2020)吉0281执402号被执行人:马波,男,汉族,37岁。被执行人马波罚金执行一案,本院作出的(2020)吉0281> 刑初78号刑事判决书,主文如下:一、被告人马波犯诈骗罪,判处有期徒刑二年二个月,并处罚金人民币二万元。(刑期从判决执 行之日起计算。判决执行前先行羁押的,羁押一日折抵刑期一日,即自2018年2月15日起至2020年4月14日止。罚金限于本判决生效 后三十日内缴纳。)二、追缴被告人违法所得34560元,返还被害人。该判决书已经发生法律效力,本院于2020年5月6日立案执行> ,要求被执行人马波履行生效法律文书中确定的义务。本院在执行过程中,分别对被执行人马波采取了如下措施:对被执行人马波 通过判决书中所确认的地址向被执行人送达了执行通知书、报告财产令、限制消费令等法律手续,对被执行人在金融部门的存款进 行了调查,未发现存款。通过查询被执行人名下的机动车辆,无车辆登记信息。通过对被执行人不动产调查,被执行人名下无不动 产登记信息。被执行人马波手机号码已为空号,同时新冠病毒疫情爆发,蛟河地区疫情严重,暂时无法下乡寻找。鉴于被执行人马 波没有主动对民事裁判涉财产部分履行,故本院已向被执行人发出限制消费令,对其今后的行为予以限制。本院认为:在本院穷尽 执行措施后,暂未发现被执行人有可供执行的财产,可以认定被执行人暂不具备履行生效法律文书确定的法律义务的能力。本院已 向申请执行人告知上诉执行情况。依照《最高人民法院关于适用的解释》第五百一十九条、最高人民法院《关于严格规范终结本次 执行程序的规定(试行)》第七条之规定,裁定如下:终结本次执行程序。终结本次执行程序后,待发现被执行人有可供执行财产 的,可以向本院申请恢复执行。再次申请不受申请执行时效期间的限制。本裁定送达后即发生法律效力。如不服本裁定,可自本裁 定书送达之日起十日内,向本院提出书面执行异议。' 14 | 15 | print(cleaner.common_zhLessThan20(sentence4)) 16 | print(cleaner.common_zhLessThan20(sentence5)) 17 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import json 5 | import argparse 6 | from tqdm import tqdm 7 | import multiprocessing as mp 8 | import transformers 9 | 10 | tokenizer_kwargs = { 11 | "use_fast": True, 12 | "revision": "xbGPT" 13 | } 14 | tokenizer_path="/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct" 15 | tokenizer = transformers.AutoTokenizer.from_pretrained("/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct") 16 | tokenizer.pad_token = tokenizer.eos_token 17 | 18 | def jobj2count(jobj): 19 | """ 20 | mp process controller 21 | """ 22 | for itm in tqdm(jobj): 23 | yield itm 24 | 25 | def process_file(js): 26 | global tokenizer 27 | num_tokens = 0 28 | text = ' '.join(js['data']).strip() 29 | tokens = tokenizer.encode(text,add_special_tokens=False)#(js['content']) 30 | num_tokens += len(tokens) 31 | return {'num_tokens': num_tokens, "score": float(js['score'])} 32 | 33 | 34 | def llama_tokenizer(args): 35 | input_dir = args.dataset_path 36 | src_files = sorted(glob.glob(os.path.join(input_dir, "*.jsonl"), recursive=True)) 37 | print(f"src_files: {src_files}") 38 | 39 | pool = mp.Pool(args.num_workers) 40 | total_tokens = 0 41 | 42 | records = {} 43 | records["files"] = [] 44 | 45 | for idx,xfile in tqdm(enumerate(src_files),total=len(src_files)): 46 | 47 | tokens = 0 48 | difficulty = 0.0 49 | filename = os.path.basename(xfile)#.replace(".jsonl","") 50 | print(f"process file: {filename}") 51 | 52 | with open(xfile,"r",encoding='utf-8') as fin: 53 | line_content = [json.loads(line) for line in fin.readlines()] 
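# Each parsed record is fanned out to the multiprocessing pool below: process_file joins js['data']
# into one string, encodes it with the Llama tokenizer, and returns the token count plus the record's
# 'score'; the loop sums these into the per-file token and difficulty totals reported in `records`.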
54 | for res in pool.imap(process_file, jobj2count(line_content)): 55 | tokens += res['num_tokens'] 56 | difficulty += res['score'] 57 | 58 | print(f'file {filename} has {tokens} tokens, {difficulty} difficulty scores.') 59 | records["files"].append( 60 | { 61 | "filename":filename, 62 | "llama_tokens":tokens, 63 | "difficulty_scores":difficulty, 64 | "total_samples":len(line_content), 65 | "avg_tokens_per_sample":1.0*tokens/len(line_content), 66 | "avg_difficulty_score_per_sample":1.0*difficulty/len(line_content), 67 | } 68 | ) 69 | total_tokens += tokens 70 | records["total_llama_tokens"] = total_tokens 71 | return records 72 | 73 | def parse_args(): 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('--dataset_name', 76 | type=str, 77 | default="jdItem", 78 | help='dataset name') 79 | parser.add_argument('--dataset_path', 80 | type=str, 81 | default="/data_warehouse/llm/source_data/JDItem_pattern_dataset/SampledRawDataset/", 82 | help='source path') 83 | parser.add_argument('--output_path', 84 | type=str, 85 | default="/data_warehouse/llm/source_data/JDItem_pattern_dataset/", 86 | help='source path') 87 | 88 | parser.add_argument('--tokenizer_path', 89 | type=str, 90 | default="/xxxx/chinese_llama_13b_plus84", 91 | help="tokenizer path, default LLaMA tokenizer") 92 | parser.add_argument('--version', 93 | type=str, 94 | default="v1", 95 | help="" 96 | ) 97 | parser.add_argument('--num_workers', 98 | type=int, 99 | default=32, 100 | help="") 101 | args = parser.parse_args() 102 | return args 103 | 104 | if __name__ == '__main__': 105 | 106 | tokenizer_kwargs = { 107 | "use_fast": True, 108 | "revision": "productGPT" 109 | } 110 | 111 | args = parse_args() 112 | records = {} 113 | 114 | #tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, **tokenizer_kwargs) 115 | #tokenizer.pad_token = tokenizer.eos_token 116 | print(f"num of llama tokens: {tokenizer.vocab_size}") 117 | 118 | records = llama_tokenizer(args) 119 | records['dataset'] = args.dataset_name 120 | 121 | output_file = os.path.join(args.output_path,"{}-meta-info-{}.json".format(args.dataset_name,args.version)) 122 | if os.path.exists(output_file): os.remove(output_file) 123 | with open(output_file, 'w') as f: 124 | json.dump(records, f, indent=4) 125 | 126 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/preprocess_cn-wechat.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | #import jieba_fast as jieba 5 | #import gzip 6 | import argparse 7 | #import chardet 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | 11 | def get_head_tail_sentence(args): 12 | global_file_no = 0 13 | global_id_no = 0 14 | 15 | dest_file = os.path.join(args.dest_path,"wechat_content_sentences.txt") 16 | if os.path.exists(dest_file): os.remove(dest_file) 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subsets = sorted(listdir(args.source_path)) 20 | for dir_no,file_name in tqdm(enumerate(subsets),total=len(subsets)): 21 | 22 | input_file = os.path.join(args.source_path,file_name) 23 | with open(input_file, 'r',encoding='utf-8') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) < 100:continue 27 | js_dict = json.loads(line) 28 | content = js_dict["content"].strip() 29 | if len(content) < 100: continue 30 | 31 | ''' 32 | split_flg = [',',';','。',',',';','。','!','?',' ','\n','\t'] 33 | 34 | fpos = 1 35 | while fpos < len(content) and 
content[fpos] not in split_flg: fpos += 1 36 | head = content[0:fpos] 37 | 38 | lpos = len(content) - 1 -1 39 | while lpos > 0 and content[lpos] not in split_flg: lpos -= 1 40 | tail = content[lpos+1:] 41 | ''' 42 | head = content[50:len(content)-50] 43 | #if len(head) > args.topk: head = head[:args.topk] 44 | #if len(tail) > args.topk: tail = tail[-args.topk:] 45 | print(head,file=of) 46 | #if tail != head: print(tail,file=of) 47 | of.close() 48 | 49 | def text_segment(args): 50 | # /root/llm/source_data/wechat_head_tail_sentences.txt 51 | dest_file = os.path.join("/root/llm/source_data/","wechat_head_tail_sentences_segment.txt") 52 | if os.path.exists(dest_file): os.remove(dest_file) 53 | of = open(dest_file,'w',encoding='utf-8') 54 | 55 | with open("/root/llm/source_data/wechat_head_tail_sentences.txt", 'r',encoding='utf-8') as f: 56 | for line in f: 57 | line = line.strip() 58 | if len(line) < 3:continue 59 | seg_list = jieba.cut(line,cut_all=False) 60 | text = ' '.join([item for item in seg_list if len(item) > 1]) 61 | print(text,file=of) 62 | of.close() 63 | 64 | def extract_keyphrase(args): 65 | keyphrse_dict = dict() 66 | 67 | idx = 0 68 | with open("/root/llm/source_data/phrases.txt",'r') as f: 69 | for line in tqdm(f): 70 | line = line.strip() 71 | if len(line) < 1:continue 72 | tokens = line.split("\t") 73 | if len(tokens) != 3: 74 | print("tokens:",tokens) 75 | continue 76 | phrase = tokens[1].replace("_","") 77 | if phrase not in keyphrse_dict: 78 | keyphrse_dict[phrase] = [1,tokens[2]] 79 | else: 80 | keyphrse_dict[phrase][0] = keyphrse_dict[phrase][0] + 1 81 | idx += 1 82 | #if idx > 50000: break 83 | # 84 | keyphrse_list = sorted(keyphrse_dict.items(), key = lambda kv:(kv[1], kv[0]),reverse = True) 85 | for item in keyphrse_list: 86 | # ('眼下正是', [1, '102.464']) 87 | freq = item[1][0] 88 | muinfo = item[1][1] 89 | phrase = item[0] 90 | #if freq < 100: continue 91 | print(f"{phrase}\t{freq}\t{muinfo}") 92 | 93 | def parse_args(): 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--source_path', 96 | type=str, 97 | default="/data/data_warehouse/llm/source_data/cn-wechat", 98 | help='Directory containing trained actor model') 99 | parser.add_argument('--dest_path', 100 | type=str, 101 | default="/root/llm/source_data/", 102 | help='Directory containing trained actor model') 103 | parser.add_argument('--dataset_name', 104 | type=str, 105 | default="cn-wechat", 106 | help="") 107 | parser.add_argument('--topk', 108 | type=int, 109 | default=20, 110 | help="max chunk size") 111 | args = parser.parse_args() 112 | return args 113 | 114 | if __name__ == "__main__": 115 | args = parse_args() 116 | 117 | if not os.path.exists(args.dest_path): 118 | os.makedirs(args.dest_path, exist_ok=True) 119 | #get_head_tail_sentence(args) 120 | #text_segment(args) 121 | extract_keyphrase(args) 122 | 123 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/special_policy.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | import os 3 | import re 4 | import sys 5 | import numpy as np 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | 8 | class SpecialPolicies(): 9 | def __init__(self,): 10 | pass 11 | 12 | @staticmethod 13 | def IsChatperText(text,threashold=10,thresh_ratio=0.25): 14 | 15 | if len(text) < 1: return False 16 | 17 | first_num = len(re.findall(r'第[0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+章', text)) 18 | second_num = 
len(re.findall(r'([0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+)', text)) 19 | 20 | if first_num > 10 or second_num > 10: return True 21 | 22 | first_num = 0 23 | second_num = 0 24 | for item in re.findall(r'第[0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+章', text): 25 | first_num += len(item) 26 | for item in re.findall(r'([0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+)', text): 27 | second_num += len(item) 28 | 29 | frist_ratio = 1.0*first_num / len(text) 30 | second_ratio = 1.0*second_num / len(text) 31 | 32 | if frist_ratio > thresh_ratio or second_ratio > thresh_ratio: return True 33 | 34 | return False 35 | 36 | @staticmethod 37 | def RemoveReference(text): 38 | # 这种行为不合情理pp=66–67, 70。 39 | # 开创了塞萨洛尼基王国pp=62–63。 40 | 41 | #text = re.sub(r"p{1,2}=\d+–*\d*,*\s*\d*–*\d*","",text) 42 | regex = re.compile(r"p{1,2}=(\d+–*\d*,*\s*)+") 43 | text = regex.sub("",text) 44 | return text 45 | 46 | @staticmethod 47 | def RemoveLastLineBreak(text): 48 | text = text.strip().strip("\n").strip() 49 | return text 50 | 51 | @staticmethod 52 | def RemoveHeadWords(text): 53 | head_words = ["概述","图片发自简书app","[转载]"] 54 | for item in head_words: 55 | text = text.lstrip(item) 56 | text = text.strip() 57 | return text 58 | 59 | 60 | @staticmethod 61 | def RemoveSpamFromContent(text,spam): 62 | regex = re.compile(spam) 63 | text = regex.sub("",text) 64 | text = text.strip() 65 | return text 66 | 67 | @staticmethod 68 | def RemoveAllReference(text): 69 | # 参考文献: 70 | regex = re.compile(r"参考文献[::].*") 71 | text = regex.sub("",text) 72 | text = text.strip() 73 | return text 74 | 75 | @staticmethod 76 | def delete_like_collect_comment(sentence): 77 | ''' 78 | For ods_zdm_detail, match and remove 点赞 收藏 评论 79 | ''' 80 | like = re.compile(r'\d*点赞') 81 | collect = re.compile('\d*收藏') 82 | comment = re.compile(r'\d*评论') 83 | sent = like.sub('', sentence) 84 | sent = collect.sub('', sent) 85 | sent = comment.sub('', sent) 86 | return sent 87 | 88 | @staticmethod 89 | def delete_author_claim(sentence): 90 | ''' 91 | For ods_zdm_detail, match and remove 作者声明xxxx 92 | ''' 93 | pattern = re.compile(r'作者声明.*|本文商品由什么.*|小编注.*|以上是.*分享.*|全文完.*|(感谢|谢谢).*(众测|测评|机会|值友).*|我是.*|(链接|商品链接).*?(去购买|去看看)|未经授权,不得转载.*|本文[^。]*.$|\|赞\d.*|The.{0,1}End.*') 94 | return pattern.sub('', sentence) 95 | 96 | @staticmethod 97 | def detect_lottery(sentence): 98 | ''' 99 | For ods_zdm 100 | ''' 101 | pattern = re.compile(r'(获奖|有奖).*活动') 102 | if pattern.search(sentence): 103 | return False 104 | else: 105 | return sentence 106 | 107 | # 2023-08-16 108 | @staticmethod 109 | def RemovewechatID(text): 110 | # 参考文献: 111 | regex = re.compile(r"微信.{0,5}[a-zA-Z_][-_a-zA-Z0-9]{5,19}") 112 | text = regex.sub("",text) 113 | text = text.strip() 114 | return text 115 | @staticmethod 116 | def RemoveAllUnicode(text): 117 | # Unicode 编码 like <200a> <200b>: 118 | regex = re.compile(r"<[0-f]{4}>") 119 | text = regex.sub("",text) 120 | text = text.strip() 121 | return text 122 | 123 | @staticmethod 124 | def is_mixed_ENCN(text): 125 | def is_chinese(char) -> bool: 126 | return char.isdigit() or ('\u4e00' <= char <= '\u9fa5') or char in ['\u3002','\uff1b','\uff0c','\uff1a','\u201c','\u201d','\uff08','\uff09','\u3001','\uff1f','\u300a','\u300b'] 127 | def is_mixed_seq(seq): 128 | sub = np.array(list(map(is_chinese, seq)), dtype=int) 129 | if np.sum(np.abs(sub[1:]-sub[:-1]))>=6: 130 | # print('ilegal:', seq) 131 | return True 132 | return False 133 | sample_num = 10 if len(text)>70 else len(text)//10 134 | starts = np.linspace(0, len(text)-7, num=sample_num, endpoint=True, dtype=int) 135 | for start in starts: 136 | 
if is_mixed_seq(text[start:start+7]): 137 | return True 138 | return False 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/requirements/conda_dpo_requirements.txt: -------------------------------------------------------------------------------- 1 | name: dpo 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - bzip2=1.0.8=h5eee18b_6 8 | - ca-certificates=2024.7.2=h06a4308_0 9 | - ld_impl_linux-64=2.38=h1181459_1 10 | - libffi=3.4.4=h6a678d5_1 11 | - libgcc-ng=11.2.0=h1234567_1 12 | - libgomp=11.2.0=h1234567_1 13 | - libstdcxx-ng=11.2.0=h1234567_1 14 | - libuuid=1.41.5=h5eee18b_0 15 | - ncurses=6.4=h6a678d5_0 16 | - openssl=3.0.14=h5eee18b_0 17 | - pip=24.2=py310h06a4308_0 18 | - python=3.10.13=h955ad1f_0 19 | - readline=8.2=h5eee18b_0 20 | - setuptools=72.1.0=py310h06a4308_0 21 | - sqlite=3.45.3=h5eee18b_0 22 | - tk=8.6.14=h39e8969_0 23 | - wheel=0.43.0=py310h06a4308_0 24 | - xz=5.4.6=h5eee18b_1 25 | - zlib=1.2.13=h5eee18b_1 26 | - pip: 27 | - accelerate==0.28.0 28 | - aiohappyeyeballs==2.3.5 29 | - aiohttp==3.10.3 30 | - aiosignal==1.3.1 31 | - annotated-types==0.7.0 32 | - anyio==4.4.0 33 | - async-timeout==4.0.3 34 | - attrs==24.2.0 35 | - cachetools==5.4.0 36 | - certifi==2024.7.4 37 | - charset-normalizer==3.3.2 38 | - click==8.1.7 39 | - cloudpickle==3.0.0 40 | - cmake==3.30.2 41 | - datasets==2.21.0 42 | - deepspeed==0.14.5 43 | - dill==0.3.8 44 | - diskcache==5.6.3 45 | - distro==1.9.0 46 | - docker-pycreds==0.4.0 47 | - docstring-parser==0.16 48 | - einops==0.8.0 49 | - exceptiongroup==1.2.2 50 | - fastapi==0.112.0 51 | - fastchat==0.1.0 52 | - filelock==3.15.4 53 | - flash-attn==2.6.3 54 | - frozenlist==1.4.1 55 | - fsspec==2024.6.1 56 | - gitdb==4.0.11 57 | - gitpython==3.1.43 58 | - h11==0.14.0 59 | - hjson==3.1.0 60 | - httpcore==1.0.5 61 | - httptools==0.6.1 62 | - httpx==0.27.0 63 | - huggingface-hub==0.24.5 64 | - idna==3.7 65 | - interegular==0.3.3 66 | - jinja2==3.1.4 67 | - jiter==0.5.0 68 | - joblib==1.4.2 69 | - jsonschema==4.23.0 70 | - jsonschema-specifications==2023.12.1 71 | - lark==1.2.2 72 | - llvmlite==0.43.0 73 | - lm-format-enforcer==0.10.1 74 | - loguru==0.7.2 75 | - markdown-it-py==3.0.0 76 | - markupsafe==2.1.5 77 | - mdurl==0.1.2 78 | - mpmath==1.3.0 79 | - msgpack==1.0.8 80 | - multidict==6.0.5 81 | - multiprocess==0.70.16 82 | - nest-asyncio==1.6.0 83 | - networkx==3.3 84 | - ninja==1.11.1.1 85 | - numba==0.60.0 86 | - numpy==1.26.4 87 | - nvidia-cublas-cu12==12.1.3.1 88 | - nvidia-cuda-cupti-cu12==12.1.105 89 | - nvidia-cuda-nvrtc-cu12==12.1.105 90 | - nvidia-cuda-runtime-cu12==12.1.105 91 | - nvidia-cudnn-cu12==8.9.2.26 92 | - nvidia-cufft-cu12==11.0.2.54 93 | - nvidia-curand-cu12==10.3.2.106 94 | - nvidia-cusolver-cu12==11.4.5.107 95 | - nvidia-cusparse-cu12==12.1.0.106 96 | - nvidia-ml-py==12.535.161 97 | - nvidia-nccl-cu12==2.20.5 98 | - nvidia-nvjitlink-cu12==12.6.20 99 | - nvidia-nvtx-cu12==12.1.105 100 | - nvitop==1.3.2 101 | - openai==1.40.6 102 | - outlines==0.0.34 103 | - packaging==24.1 104 | - pandas==2.2.2 105 | - platformdirs==4.2.2 106 | - prometheus-client==0.20.0 107 | - prometheus-fastapi-instrumentator==7.0.0 108 | - protobuf==5.27.3 109 | - psutil==6.0.0 110 | - py-cpuinfo==9.0.0 111 | - pyarrow==17.0.0 112 | - pycryptodome==3.20.0 113 | - pydantic==2.8.2 114 | - pydantic-core==2.20.1 115 | - pygments==2.18.0 116 | - python-dateutil==2.9.0.post0 117 | - python-dotenv==1.0.1 118 | - 
pytz==2024.1 119 | - pyyaml==6.0.2 120 | - ray==2.34.0 121 | - referencing==0.35.1 122 | - regex==2024.7.24 123 | - requests==2.32.3 124 | - rich==13.7.1 125 | - rpds-py==0.20.0 126 | - safetensors==0.4.4 127 | - scipy==1.14.0 128 | - sentencepiece==0.2.0 129 | - sentry-sdk==2.13.0 130 | - setproctitle==1.3.3 131 | - shtab==1.7.1 132 | - six==1.16.0 133 | - smmap==5.0.1 134 | - sniffio==1.3.1 135 | - starlette==0.37.2 136 | - sympy==1.13.2 137 | - termcolor==2.4.0 138 | - tiktoken==0.7.0 139 | - tokenizers==0.15.2 140 | - torch==2.3.0 141 | - tqdm==4.66.5 142 | - transformers==4.38.2 143 | - triton==2.3.0 144 | - trl==0.9.6 145 | - typing-extensions==4.12.2 146 | - tyro==0.8.8 147 | - tzdata==2024.1 148 | - urllib3==2.2.2 149 | - uvicorn==0.30.6 150 | - uvloop==0.19.0 151 | - vllm==0.4.3 152 | - vllm-flash-attn==2.5.8.post2 153 | - wandb==0.17.7 154 | - watchfiles==0.23.0 155 | - websockets==12.0 156 | - xformers==0.0.26.post1 157 | - xxhash==3.5.0 158 | - yarl==1.9.4 159 | prefix: /mnt/lptest/Miniconda3/envs/dpo 160 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import logging 3 | import torch 4 | import transformers 5 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 6 | from einops import rearrange 7 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func 8 | from flash_attn.bert_padding import unpad_input, pad_input 9 | from loguru import logger 10 | 11 | def forward( 12 | self, 13 | hidden_states: torch.Tensor, 14 | attention_mask: Optional[torch.Tensor] = None, 15 | position_ids: Optional[torch.Tensor] = None, 16 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 17 | output_attentions: bool = False, 18 | use_cache: bool = False, 19 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 20 | """Input shape: Batch x Time x Channel 21 | 22 | attention_mask: [bsz, q_len] 23 | """ 24 | bsz, q_len, _ = hidden_states.size() 25 | 26 | query_states = ( 27 | self.q_proj(hidden_states) 28 | .view(bsz, q_len, self.num_heads, self.head_dim) 29 | .transpose(1, 2) 30 | ) 31 | key_states = ( 32 | self.k_proj(hidden_states) 33 | .view(bsz, q_len, self.num_heads, self.head_dim) 34 | .transpose(1, 2) 35 | ) 36 | value_states = ( 37 | self.v_proj(hidden_states) 38 | .view(bsz, q_len, self.num_heads, self.head_dim) 39 | .transpose(1, 2) 40 | ) 41 | # [bsz, q_len, nh, hd] 42 | # [bsz, nh, q_len, hd] 43 | 44 | kv_seq_len = key_states.shape[-2] 45 | assert past_key_value is None, "past_key_value is not supported" 46 | 47 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 48 | query_states, key_states = apply_rotary_pos_emb( 49 | query_states, key_states, cos, sin, position_ids 50 | ) 51 | # [bsz, nh, t, hd] 52 | assert not output_attentions, "output_attentions is not supported" 53 | assert not use_cache, "use_cache is not supported" 54 | 55 | # Flash attention codes from 56 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py 57 | 58 | # transform the data into the format required by flash attention 59 | qkv = torch.stack( 60 | [query_states, key_states, value_states], dim=2 61 | ) # [bsz, nh, 3, q_len, hd] 62 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] 63 | # We have disabled _prepare_decoder_attention_mask in LlamaModel 64 | # the 
attention_mask should be the same as the key_padding_mask 65 | key_padding_mask = attention_mask 66 | 67 | if key_padding_mask is None: 68 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 69 | max_s = q_len 70 | cu_q_lens = torch.arange( 71 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 72 | ) 73 | print("cu_q_lens:", cu_q_lens.size()) 74 | output = flash_attn_varlen_qkvpacked_func( 75 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 76 | ) 77 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz) 78 | else: 79 | nheads = qkv.shape[-2] 80 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 81 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 82 | x_unpad = rearrange( 83 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads 84 | ) 85 | output_unpad = flash_attn_varlen_qkvpacked_func( 86 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 87 | ) 88 | output = rearrange( 89 | pad_input( 90 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len 91 | ), 92 | "b s (h d) -> b s h d", 93 | h=nheads, 94 | ) 95 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None 96 | 97 | 98 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 99 | # requires the attention mask to be the same as the key_padding_mask 100 | def _prepare_decoder_attention_mask( 101 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 102 | ): 103 | # [bsz, seq_len] 104 | return attention_mask 105 | 106 | 107 | def replace_llama_attn_with_flash_attn(): 108 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 109 | if cuda_major < 8: 110 | logging.warning( 111 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
112 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 113 | ) 114 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 115 | _prepare_decoder_attention_mask 116 | ) 117 | logger.warning("Replace with flash_attention") 118 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 119 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ocr_nlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import json 5 | import hashlib 6 | import re 7 | import multiprocessing as mp 8 | import argparse 9 | from os import listdir 10 | from tqdm import tqdm 11 | from util import load_set_from_txt 12 | 13 | PROCESS = 2 14 | 15 | def extract_sentences_with_colon(text): 16 | sentence_delimiters = r'[,.,。;!?]' 17 | sentences = re.split(sentence_delimiters, text) 18 | extracted_sentences = [] 19 | remaining_text = "" 20 | 21 | for sentence in sentences: 22 | if ':' in sentence or ':' in sentence: 23 | extracted_sentences.append(sentence.strip()) 24 | else: 25 | remaining_text += sentence.strip() + " " 26 | 27 | return extracted_sentences, remaining_text.strip() 28 | 29 | def controller(input_file): 30 | for line in open(input_file,'r',encoding="utf-8"): 31 | line = line.strip() 32 | if len(line) < 5:continue 33 | 34 | js_dict = json.loads(line) 35 | item_id = js_dict["item_id"] 36 | ocr_ret_list = js_dict["ocr_ret"] 37 | ocr_text = "" 38 | for item in ocr_ret_list: 39 | image_name = item["img_name"] 40 | ocr_ret = item["ocr_ret"] 41 | one_img_content = concat_one_img(ocr_ret) 42 | if len(one_img_content) < 5:continue 43 | if ocr_text != "": ocr_text += "。" 44 | ocr_text += one_img_content 45 | yield item_id, ocr_text 46 | 47 | #reload(sys) 48 | #sys.setdefaultencoding('utf-8') 49 | 50 | ''' 51 | {"ocr_ret": [{"img_name": "/vmware_data/gaodiqi/jingdong_imgs/100000040875/detailimg_e6d026e0d93c15d7174425f9c778eb7e.jpg", "ocr_ret": [{"index": [[248.0, 172.0], [619.0, 172.0], [619.0, 217.0], [248.0, 217.0]], "content": " 年风霜,匠心如初", "confidence": 0.9926699995994568}, {"index": [[58.0, 284.0], [741.0, 283.0], [741.0, 324.0], [58.0, 325.0]], "content": "品质依然,福东海健康食材的选择", "confidence": 0.9772330522537231}, {"index": [[246. 
0, 351.0], [306.0, 351.0], [306.0, 371.0], [246.0, 371.0]], "content": "黄民", "confidence": 0.8339320421218872}, {"index": [[406.0, 352.0], [470.0, 349.0], [471.0, 369.0], [407.0, 372.0]], "content": "胎菊", "confidence": 0.8963175415992737}, {"index": [[571.0, 351.0], [631.0, 351.0], [631.0, 371.0], [571.0, 371.0]], "content": "贡菊", "confidence": 0.8061279058456421}, 52 | ''' 53 | def concat_one_img(ocr_ret_list): 54 | 55 | ans = "" 56 | duplicate_set = set() 57 | for item in ocr_ret_list: 58 | index = item["index"] 59 | content = item["content"].strip() 60 | if len(content) < 1:continue 61 | md5 = hashlib.md5(content.encode('utf-8')).hexdigest() 62 | #print("md5:",md5) 63 | if md5 in duplicate_set: continue 64 | duplicate_set.add(md5) 65 | if ans != "": ans += "," 66 | ans += content 67 | return ans 68 | 69 | def extract(input): 70 | item_id, ocr_text = input 71 | pairs, text = [], '' 72 | if len(ocr_text) > 10: 73 | pairs, text = extract_sentences_with_colon(ocr_text) 74 | output = { 75 | "id": item_id, 76 | "source": "OCR", 77 | "source_id":"", 78 | "content": {"pairs": pairs,"text": text, "qa":""} 79 | } 80 | return output 81 | 82 | def HandleSingleFile(input_file, output): 83 | pools = mp.Pool(PROCESS) 84 | 85 | flush_steps = 0 86 | flush_per_steps = 50 87 | for res in pools.imap(extract, controller(input_file)): 88 | if res is not None: 89 | jstr = json.dumps(res, ensure_ascii=False) 90 | output.write(jstr+"\n") 91 | flush_steps += 1 92 | if flush_steps % flush_per_steps == 0: 93 | output.flush() 94 | 95 | def parse_args(): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument('--source_path', 98 | type=str, 99 | default="/root/llm/source_data/cn-JD-ocrtext/", 100 | help='Source directory containing the cn-JD-ocrtext OCR result files') 101 | parser.add_argument('--dest_path', 102 | type=str, 103 | default="/root/llm/clean_data/cn-JD-ocrtext/", 104 | help='Output directory for the extracted OCR text files') 105 | args = parser.parse_args() 106 | return args 107 | 108 | if __name__ == "__main__": 109 | args = parse_args() 110 | files = sorted(listdir(args.source_path)) 111 | 112 | Output_Dir = os.path.join(args.dest_path) 113 | 114 | if not os.path.exists(Output_Dir): 115 | os.makedirs(Output_Dir, exist_ok=True) 116 | 117 | for input_file in tqdm(files,total=len(files)): 118 | input = os.path.join(args.source_path, input_file) 119 | output_file = os.path.join(Output_Dir, input_file) 120 | if os.path.exists(output_file): os.remove(output_file) 121 | output = open(output_file, 'a+', encoding='utf-8') 122 | 123 | HandleSingleFile(input, output) 124 | 125 | output.close() 126 | 127 | --------------------------------------------------------------------------------