└── codes_datasets ├── DataCleaning ├── utils │ ├── .gitkeep │ ├── __init__.py │ ├── law_lowwords.txt │ ├── word2phrase │ ├── spam_words_wudao.txt │ ├── __pycache__ │ │ ├── util.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── general_policy.cpython-37.pyc │ │ ├── general_policy.cpython-38.pyc │ │ ├── special_policy.cpython-38.pyc │ │ ├── check_black_words.cpython-38.pyc │ │ └── clean_headtails_from_content.cpython-38.pyc │ ├── ebook_lowwords.txt │ ├── ads_predict_fasttext.py │ ├── check_political_toxic.py │ ├── opencc_t2s.py │ ├── ray_utils.py │ ├── split_bigdata_file.py │ ├── random_sample.py │ ├── pretrain_data_sampling.py │ ├── safetycheck_random_sample.py │ ├── clean_headtails_from_content.py │ ├── test60.py │ ├── tokenizer.py │ ├── special_policy.py │ └── ocr_nlp.py ├── clean │ ├── __init__.py │ └── hjyroot │ │ └── sftDataGen │ │ └── clean-dataset │ │ ├── jd_cleaned_v1.jsonl │ │ └── .ipynb_checkpoints │ │ └── jd_cleaned_v1-checkpoint.jsonl ├── requirements.txt ├── stopall.sh ├── fasttext │ ├── README.md │ └── run_fastext.sh ├── run_tokenizer.sh ├── README.md ├── run_data_cleaning.sh └── preprocess │ ├── .ipynb_checkpoints │ ├── preprocess_cn-baidu_weixin-checkpoint.py │ ├── preprocess_cn-e-txt-checkpoint.py │ ├── search_pretraindata-checkpoint.py │ ├── preprocess_cn-kindle-checkpoint.py │ ├── preprocess_cn-39health-checkpoint.py │ ├── preprocess_cn-sina_iask-checkpoint.py │ ├── mnbvc_prepare-checkpoint.py │ └── preprocess_cn-mnbvc-checkpoint.py │ └── preprocess_cn-wechat.py ├── Postraining_dpo ├── tests │ ├── __init__.py │ ├── test_e2e.py │ ├── testing_constants.py │ ├── test_core.py │ ├── testing_utils.py │ ├── test_best_of_n_sampler.py │ ├── test_ddpo_trainer.py │ ├── test_data_collator_completion_only.py │ └── test_iterative_sft_trainer.py ├── examples │ ├── accelerate_configs │ │ ├── path │ │ │ └── hostfile4 │ │ ├── multi_gpu.yaml │ │ ├── deepspeed_zero1.yaml │ │ ├── deepspeed_zero2.yaml │ │ ├── deepspeed_zero3.yaml │ │ ├── zero3_multi_nodes.yaml │ │ └── zero2_multi_nodes.yaml │ ├── research_projects │ │ ├── stack_llama_2 │ │ │ └── scripts │ │ │ │ ├── requirements.txt │ │ │ │ └── README.md │ │ ├── toxicity │ │ │ └── README.md │ │ ├── README.md │ │ ├── stack_llama │ │ │ └── scripts │ │ │ │ ├── README.md │ │ │ │ └── merge_peft_adapter.py │ │ └── tools │ │ │ └── calculator.py │ ├── notebooks │ │ └── README.md │ └── hello_world.py ├── scripts │ ├── hostfile4 │ ├── killall.sh │ ├── stopall.sh │ ├── accelerate_configs │ │ ├── single_gpu.yaml │ │ ├── multi_gpu.yaml │ │ ├── deepspeed_zero1.yaml │ │ ├── deepspeed_zero2.yaml │ │ ├── deepspeed_zero3.yaml │ │ └── fsdp_qlora.yaml │ ├── dpo_infer.sh │ ├── dpo_pairwise_winrate.sh │ ├── profile │ ├── dpo_pairwise_score.sh │ └── postrain_with_dpo.sh ├── trl │ ├── __pycache__ │ │ ├── core.cpython-310.pyc │ │ ├── core.cpython-38.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-310.pyc │ │ ├── env_utils.cpython-310.pyc │ │ ├── import_utils.cpython-310.pyc │ │ └── import_utils.cpython-38.pyc │ ├── models │ │ ├── __pycache__ │ │ │ ├── utils.cpython-310.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── modeling_base.cpython-310.pyc │ │ │ ├── modeling_base.cpython-38.pyc │ │ │ ├── modeling_value_head.cpython-38.pyc │ │ │ └── modeling_value_head.cpython-310.pyc │ │ ├── __init__.py │ │ └── auxiliary_modules.py │ ├── trainer │ │ ├── __pycache__ │ │ │ ├── base.cpython-310.pyc │ │ │ ├── base.cpython-38.pyc │ │ │ ├── utils.cpython-38.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ 
├── judges.cpython-310.pyc │ │ │ ├── utils.cpython-310.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── callbacks.cpython-310.pyc │ │ │ ├── ddpo_config.cpython-310.pyc │ │ │ ├── ddpo_config.cpython-38.pyc │ │ │ ├── dpo_config.cpython-310.pyc │ │ │ ├── dpo_trainer.cpython-310.pyc │ │ │ ├── dpo_trainer.cpython-38.pyc │ │ │ ├── model_config.cpython-310.pyc │ │ │ ├── ppo_config.cpython-310.pyc │ │ │ ├── ppo_trainer.cpython-310.pyc │ │ │ ├── sft_trainer.cpython-310.pyc │ │ │ ├── reward_trainer.cpython-310.pyc │ │ │ ├── training_configs.cpython-310.pyc │ │ │ └── iterative_sft_trainer.cpython-310.pyc │ │ ├── online_dpo_config.py │ │ ├── rloo_config.py │ │ ├── ppov2_config.py │ │ ├── base.py │ │ ├── reward_config.py │ │ ├── training_configs.py │ │ ├── orpo_config.py │ │ ├── sft_config.py │ │ ├── model_config.py │ │ ├── alignprop_config.py │ │ └── cpo_config.py │ ├── extras │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── best_of_n_sampler.cpython-310.pyc │ │ │ └── best_of_n_sampler.cpython-38.pyc │ │ ├── __init__.py │ │ └── dataset_formatting.py │ ├── commands │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ └── cli_utils.cpython-310.pyc │ │ ├── __init__.py │ │ └── cli.py │ ├── environment │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── base_environment.cpython-38.pyc │ │ │ └── base_environment.cpython-310.pyc │ │ └── __init__.py │ └── env_utils.py ├── xllm │ ├── __pycache__ │ │ ├── util.cpython-310.pyc │ │ ├── data_decrypt.cpython-310.pyc │ │ └── request_http.cpython-310.pyc │ ├── dpo_win_state.py │ └── llama_flash_attn_monkey_patch.py ├── killall.sh ├── stopall.sh ├── README.md └── requirements │ ├── pip_dpo_requirements.txt │ └── conda_dpo_requirements.txt ├── PreferData └── README.md └── SFTData └── README.md /codes_datasets/DataCleaning/utils/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/PreferData/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Preference Data Recipe 4 | -------------------------------------------------------------------------------- /codes_datasets/SFTData/README.md: -------------------------------------------------------------------------------- 1 | 中英文指令数据集,需要根据自己的业务情况,进行清洗,指令数据的质量是影响效果的关键因素! 
2 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/hjyroot/sftDataGen/clean-dataset/jd_cleaned_v1.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/law_lowwords.txt: -------------------------------------------------------------------------------- 1 | 审判员 2 | 书记员 3 | 审判长 4 | 人民陪审员 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/path/hostfile4: -------------------------------------------------------------------------------- 1 | ip1 slots=8 2 | ip2 slots=8 3 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/clean/hjyroot/sftDataGen/clean-dataset/.ipynb_checkpoints/jd_cleaned_v1-checkpoint.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/word2phrase: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/word2phrase -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/spam_words_wudao.txt: -------------------------------------------------------------------------------- 1 | 图片发自简书app 2 | 原文地址: 3 | 综合网络,如有侵权联系删除。 4 | 本文章已经通过区块链技术进行版权认证,禁止任何形式的改编转载抄袭,违者追究法律责任 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/hostfile4: -------------------------------------------------------------------------------- 1 | 15.108.121.45 slots=8 2 | 15.108.121.46 slots=8 3 | 15.108.121.47 slots=8 4 | 15.108.121.48 slots=8 5 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/core.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/util.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/util.cpython-310.pyc -------------------------------------------------------------------------------- 
/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama_2/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | trl 3 | peft 4 | accelerate 5 | datasets 6 | bitsandbytes 7 | wandb 8 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/env_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/env_utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-37.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/general_policy.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/special_policy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/special_policy.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-310.pyc 
-------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/__pycache__/import_utils.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/check_black_words.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/check_black_words.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/killall.sh: -------------------------------------------------------------------------------- 1 | source /etc/profile 2 | pdsh -w ssh:15.108.121.45,15.108.121.46,15.108.121.47,15.108.121.48 "bash /mnt/lptest/xubu/postrain/scripts/stopall.sh" 3 | 4 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/judges.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/judges.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/data_decrypt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/data_decrypt.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/__pycache__/request_http.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/xllm/__pycache__/request_http.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/commands/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/callbacks.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/callbacks.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__pycache__/cli_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/commands/__pycache__/cli_utils.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_base.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ddpo_config.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/dpo_trainer.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/model_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/model_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_config.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/ppo_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/sft_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/sft_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/reward_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/reward_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/extras/__pycache__/best_of_n_sampler.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/training_configs.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/training_configs.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/__pycache__/clean_headtails_from_content.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/DataCleaning/utils/__pycache__/clean_headtails_from_content.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-38.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/models/__pycache__/modeling_value_head.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/environment/__pycache__/base_environment.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/__pycache__/iterative_sft_trainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xubuvd/LLMs/HEAD/codes_datasets/Postraining_dpo/trl/trainer/__pycache__/iterative_sft_trainer.cpython-310.pyc -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_e2e.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def test_hello_world(): 5 | subprocess.run( 6 | "python examples/hello_world.py", 7 | shell=True, 8 | check=True, 9 | ) 10 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/killall.sh: -------------------------------------------------------------------------------- 1 | source /etc/profile 2 | pdsh -w ssh:10.208.111.45,10.208.112.209,10.208.109.54,10.208.110.235 "bash /mnt/lptest/xubu/postrain/stopall.sh" 3 | pdsh -w ssh:10.208.111.45,10.208.112.209,10.208.109.54,10.208.110.235 "rm -rf /tmp/*" 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/stopall.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | ps aux|grep postrain_with_dpo.sh |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep postrain.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/stopall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux|grep postrain_with_dpo.sh |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep postrain.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | 5 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/toxicity/README.md: -------------------------------------------------------------------------------- 1 | # De-detoxifying language models 2 | 3 | To run this code, do the following: 4 | 5 | ```shell 6 | ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb 7 | ``` -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | torch 3 | opencc 4 | flashtext 5 | tomark 6 | SentencePiece 7 | chardet 8 | apache-flink 9 | jieba 10 | pymysql 11 | dbutils 12 | xToolkit 13 | huggingface_hub 14 | openyxl 15 | jieba_fast 16 | TextBlob 17 | fpgrowth_py 18 | jsonlines 19 | openai 20 | pysubs2 21 | langid 22 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/stopall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux|grep wanjuan_clean.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 3 | ps aux|grep tokenizer.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 4 | ps aux|grep random_sample.py |grep -v grep | awk '{print $2}' | while read pid ;do kill -9 $pid;done; 5 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/fasttext/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 🌱 Training fastext recipe 4 | ``` 5 | train_file = './data/clean_data.txt.train' 6 | python train.py \ 7 | --input_file $train_file \ 8 | --pretrain_w2v ./w2v/wiki-news-300d-1M.vec \ 9 | --model_file ./output_models/imagenet.bin \ 10 | --stop_word_path ./data/stop_words_en.txt \ 11 | --output_path ./data/imagenet_res.jsonl \ 12 | --mode train 13 | ``` 14 | 15 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: "NO" 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- 
/codes_datasets/Postraining_dpo/examples/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_infer.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | 4 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 5 | for model in ${models[*]} 6 | do 7 | echo "infer model ${model} on test dataset of ${dataset_name} ..." 8 | CUDA_VISIBLE_DEVICES='0,1' python dpo_generation_vllm.py \ 9 | --input_file ../dpo_dataGen/xllm_eval_500_final.jsonl \ 10 | --model_name $model \ 11 | --batch_size 4 12 | done 13 | 14 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ebook_lowwords.txt: -------------------------------------------------------------------------------- 1 | 声明: 2 | 声明 3 | 本书由 4 | 采集整理 5 | 仅供试读 6 | 版权归原作者所有 7 | 如有侵权 8 | 请联系本站 9 | 及时删除 10 | 提醒您: 11 | 提醒您 12 | 合理安排阅读时间 13 | 杜绝沉迷 14 | 网络小说 15 | 作者: 16 | 编简介 17 | 小说类别 18 | 总推荐 19 | 标签: 20 | 读者印象: 21 | 疑难解答 22 | 更新时间 23 | 本章字数 24 | 好书尽在疑难解答 25 | 网更新时间 26 | 本章字数 27 | 版权信息 28 | COPYRIGHT 29 | 书名 30 | 出版社 31 | 出版时间 32 | ISBN 33 | 授权得到 34 | APP电子版 35 | 制作与发行版权 36 | 所有侵权必究 37 | 更多精校小说 38 | 下载 39 | 更多小说下载 40 | 百度贴吧 41 | 收集整理 42 | 更多精校小说尽在 43 | 知轩藏书 44 | 全书完 45 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from typing import TYPE_CHECKING 3 | from ..import_utils import _LazyModule 4 | 5 | _import_structure = { 6 | "base_environment": ["TextEnvironment", "TextHistory"], 7 | } 8 | 9 | if TYPE_CHECKING: 10 | from .base_environment import TextEnvironment, TextHistory 11 | else: 12 | import sys 13 | 14 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 15 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/fasttext/run_fastext.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | train_file=./data/clean_data.txt.train 4 | 5 | python train.py \ 6 | --input_file $train_file \ 7 | --pretrain_w2v ./w2v/wiki-news-300d-1M.vec \ 8 | --model_file 
./output_models/imagenet.bin \ 9 | --stop_word_path ./data/stop_words_en.txt \ 10 | --output_path ./data/imagenet_res.jsonl \ 11 | --mode train 12 | if [ $? -ne 0 ]; then 13 | echo "train.py: ${train_file} failed." 14 | exit 15 | else 16 | echo "train.py: ${train_file} succeed." 17 | fi 18 | 19 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_pairwise_winrate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 4 | for model_compared in ${models[*]} 5 | do 6 | for eval_file in 'xllmtest' 7 | do 8 | k1=sft-checkpoint-72800 9 | k2=$model_compared 10 | 11 | python win_tie_loss_stat.py \ 12 | -i1 ${k1}-${k2}-${eval_file}.json \ 13 | -k1 $k1 \ 14 | -i2 ${k2}-${k1}-${eval_file}.json \ 15 | -k2 $k2 \ 16 | --output_dir ./ \ 17 | --dst $eval_file 18 | done 19 | done 20 | 21 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: 'bf16' 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero2.yaml: 
-------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: 'bf16' 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | # Research projects that use TRL 2 | 3 | Welcome to the research projects folder! Here you can find the scripts used for some research projects that used TRL and maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information! 4 | 5 | - [De-detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity) 6 | - [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama) 7 | - [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2) -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: true 9 | zero3_save_16bit_model: true 10 | zero_stage: 3 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | machine_rank: 0 14 | main_training_function: main 15 | mixed_precision: 'bf16' 16 | num_machines: 1 17 | num_processes: 8 18 | rdzv_backend: static 19 | same_network: true 20 | tpu_env: [] 21 | tpu_use_cluster: false 22 | tpu_use_sudo: false 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/run_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="Alpaca52k" 4 | clearning_version="v1" 5 | 6 | src_dir="/Users/mac/Downloads/sft/paper/sft_train4" 7 | dest_dir="/Users/mac/Downloads/sft/paper/sft_data" 8 | num_workers=1 9 | 10 | tokenizer_path="/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct" 11 | python utils/tokenizer.py \ 12 | --dataset_name ${dataset} \ 13 | --dataset_path ${src_dir} \ 14 | --output_path ${dest_dir} \ 15 | --tokenizer_path ${tokenizer_path} \ 16 | --version ${clearning_version} \ 17 | --num_workers ${num_workers} 18 | if [ $? -ne 0 ]; then 19 | echo "tokenizer.py failed." 20 | exit 21 | else 22 | echo "tokenizer.py succeed." 
23 | fi 24 | 25 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/profile: -------------------------------------------------------------------------------- 1 | # /etc/profile: system-wide .profile file for the Bourne shell (sh(1)) 2 | # and Bourne compatible shells (bash(1), ksh(1), ash(1), ...). 3 | 4 | if [ "${PS1-}" ]; then 5 | if [ "${BASH-}" ] && [ "$BASH" != "/bin/sh" ]; then 6 | # The file bash.bashrc already sets the default PS1. 7 | # PS1='\h:\w\$ ' 8 | if [ -f /etc/bash.bashrc ]; then 9 | . /etc/bash.bashrc 10 | fi 11 | else 12 | if [ "$(id -u)" -eq 0 ]; then 13 | PS1='# ' 14 | else 15 | PS1='$ ' 16 | fi 17 | fi 18 | fi 19 | 20 | if [ -d /etc/profile.d ]; then 21 | for i in /etc/profile.d/*.sh; do 22 | if [ -r $i ]; then 23 | . $i 24 | fi 25 | done 26 | unset i 27 | fi 28 | export PDSH_RCMD_TYPE=ssh 29 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/zero3_multi_nodes.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /path/hostfile 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 1 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: true 10 | zero3_save_16bit_model: true 11 | zero_stage: 3 12 | distributed_type: DEEPSPEED 13 | downcast_bf16: 'no' 14 | machine_rank: 0 15 | main_process_ip: 10.208.110.235 16 | main_process_port: 50528 17 | main_training_function: main 18 | mixed_precision: bf16 19 | num_machines: 4 20 | num_processes: 32 21 | rdzv_backend: static 22 | same_network: false 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /mnt/lptest/xubu/postrain/scripts/hostfile4 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 2 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: true 10 | zero3_save_16bit_model: true 11 | zero_stage: 3 12 | distributed_type: DEEPSPEED 13 | main_process_ip: 15.108.121.45 14 | main_process_port: 5158 15 | downcast_bf16: 'no' 16 | machine_rank: 0 17 | main_training_function: main 18 | mixed_precision: bf16 19 | num_machines: 4 20 | num_processes: 32 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/accelerate_configs/zero2_multi_nodes.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_hostfile: /data/usr/pangwei/frontllm/sft/dpo/examples/accelerate_configs/path/hostfile 5 | deepspeed_multinode_launcher: pdsh 6 | gradient_accumulation_steps: 1 7 | offload_optimizer_device: none 8 | offload_param_device: none 9 | zero3_init_flag: false 10 | zero_stage: 2 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | 
machine_rank: 0 14 | main_process_ip: 172.16.19.45 15 | main_process_port: 2222 16 | main_training_function: main 17 | mixed_precision: bf16 18 | num_machines: 2 19 | num_processes: 16 20 | rdzv_backend: static 21 | same_network: false 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/online_dpo_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from typing import Literal 4 | 5 | from trl.trainer.utils import OnPolicyConfig 6 | 7 | 8 | @dataclass 9 | class OnlineDPOConfig(OnPolicyConfig): 10 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 11 | """the name of this experiment""" 12 | reward_model_path: str = "EleutherAI/pythia-160m" 13 | """the path to the reward model""" 14 | 15 | num_epochs: int = 4 16 | """the number of epochs to train""" 17 | 18 | beta: float = 0.05 19 | """the entropy regularization coefficient of DPO""" 20 | loss_type: Literal["sigmoid", "ipo"] = "sigmoid" 21 | """the type of loss to use for online DPO""" 22 | disable_dropout: bool = True 23 | """whether to disable dropout of the model during training""" 24 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications. 4 | 5 | - [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. 6 | - [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. 7 | - [`gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. 
8 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/rloo_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from ..trainer.utils import OnPolicyConfig 5 | 6 | 7 | @dataclass 8 | class RLOOConfig(OnPolicyConfig): 9 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 10 | """the name of this experiment""" 11 | reward_model_path: str = "EleutherAI/pythia-160m" 12 | """the path to the reward model""" 13 | 14 | # ppo config 15 | num_ppo_epochs: int = 4 16 | """the number of epochs to train""" 17 | whiten_rewards: bool = False 18 | """whether to whiten the rewards""" 19 | kl_coef: float = 0.05 20 | """the KL coefficient""" 21 | cliprange: float = 0.2 22 | """the clip range""" 23 | 24 | # rloo config 25 | rloo_k: int = 2 26 | """REINFORCE Leave-One-Out (RLOO) number of online samples per prompt""" 27 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__" 16 | CI_HUB_USER_FULL_NAME = "Dummy User" 17 | CI_HUB_USER_TOKEN = "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL" 18 | 19 | CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co" 20 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | Author: xubuvd 3 | Date: 13/08/2024 4 | Email: xubuvd@163.com 5 | ``` 6 | 7 | # 🌱 数据清洗方案 - Data Cleaning Recipe 8 | 它包含四个主要阶段:
9 | 1. **初始数据清洗**:对28个特定领域的数据集应用多种启发式过滤方法。
10 | 2. **文档级去重**:使用 MinHash 去除重复文档。
11 | 3. **统计分析**:使用 Llama-3.1-8B-Instruct 的分词器统计总 token 数。
12 | 4. **人工评估**:对100个数据点进行抽样和手动审查。
13 |
14 | It consists of four main stages:
15 | 1. **Initial Data Cleaning**: Apply various heuristic filtering methods to 28 domain-specific datasets.
16 | 2. **Document-Level Deduplication**: Use MinHash to remove near-duplicate documents (see the sketch after this list).
17 | 3. **Statistical Analysis**: Count the total number of tokens using the Llama-3.1-8B-Instruct tokenizer.
18 | 4. **Human Evaluation**: Conduct a manual review by sampling 100 data points.
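Stage 2 (document-level deduplication) is the most algorithmic step, so a minimal sketch is given below. It only illustrates the MinHash idea, assuming the third-party `datasketch` library and character 3-gram shingles; the actual scripts in this repo may implement it differently.

```python
# Minimal MinHash near-duplicate removal sketch (assumes `pip install datasketch`).
from datasketch import MinHash, MinHashLSH

def minhash_of(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    # Character 3-gram shingles work for mixed Chinese/English text.
    for i in range(max(len(text) - 2, 1)):
        m.update(text[i:i + 3].encode("utf-8"))
    return m

def dedup(docs, threshold=0.8, num_perm=128):
    """Keep only the first document of each near-duplicate cluster."""
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    kept = []
    for idx, doc in enumerate(docs):
        m = minhash_of(doc, num_perm)
        if lsh.query(m):  # a near-duplicate document was already kept
            continue
        lsh.insert(str(idx), m)
        kept.append(doc)
    return kept

if __name__ == "__main__":
    print(dedup(["今天天气很好。", "今天天气很好!", "完全不同的一篇文档。"]))
```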
19 | 20 | # 🍂 启动和暂停 - running and killing 21 | ``` 22 | nohup bash run_data_cleaning.sh > r.log 2>&1 & 23 | bash stopall.sh 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/README.md: -------------------------------------------------------------------------------- 1 | 2 | # See pip. conda envs at ./requirements/ 3 | ``` 4 | trl, v0.9.6 5 | conda_dpo_requirements.txt 6 | pip_dpo_requirements.txt 7 | ``` 8 | # step1: Post-Training with Direct Preference Optimization 9 | ``` 10 | bash scripts/postrain_with_dpo.sh 11 | ``` 12 | 13 | ## Input data formation 14 | traing data stored in jsonl style, one line is as follows: 15 | ``` 16 | {"id":"1","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 17 | {"id":"2","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 18 | {"id":"3","source":"xllm_dataset","prompt":"","chosen":"","reject":""} 19 | ... 20 | ``` 21 | 22 | # step2: Make the trained model infering with vllm 23 | ``` 24 | bash dpo_infer.sh 25 | ``` 26 | 27 | # step3: Compare performance of two models of DPO vs. SFT with gpt4-0613 28 | ``` 29 | bash dpo_pairwise_score.sh 30 | ``` 31 | 32 | # step4: Calculate win-rate for two compared models 33 | ``` 34 | bash dpo_pairwise_winrate.sh 35 | ``` 36 | 37 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/ppov2_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from ..trainer.utils import OnPolicyConfig 5 | 6 | 7 | @dataclass 8 | class PPOv2Config(OnPolicyConfig): 9 | exp_name: str = os.path.basename(__file__)[: -len(".py")] 10 | """the name of this experiment""" 11 | reward_model_path: str = "EleutherAI/pythia-160m" 12 | """the path to the reward model""" 13 | 14 | # ppo config 15 | num_ppo_epochs: int = 4 16 | """the number of epochs to train""" 17 | whiten_rewards: bool = False 18 | """whether to whiten the rewards""" 19 | kl_coef: float = 0.05 20 | """the KL coefficient""" 21 | cliprange: float = 0.2 22 | """the clip range""" 23 | vf_coef: float = 0.1 24 | """the value function coefficient""" 25 | cliprange_value: float = 0.2 26 | """the clip range for the value function""" 27 | gamma: float = 1 28 | """the discount factor""" 29 | lam: float = 0.95 30 | """the lambda value for GAE""" 31 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ads_predict_fasttext.py: -------------------------------------------------------------------------------- 1 | # _*_coding:utf-8 _*_ 2 | import json 3 | import logging 4 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 5 | import fasttext 6 | from tqdm import tqdm 7 | 8 | #加载模型 9 | model = fasttext.load_model('./fastText_shortAd/models/fasttext_train.model.bin') 10 | 11 | labels_right = [] 12 | texts = [] 13 | labels_predict = [] 14 | 15 | with open("/data/data_warehouse/llm/source_data/cn-wechat/wx_data_980.jsonl") as fr: 16 | datas = fr.readlines() 17 | for idx,line in tqdm(enumerate(datas),total=len(datas)): 18 | line = line.strip() 19 | if len(line) < 5: continue 20 | js_dict = json.loads(line) 21 | text = js_dict["content"].strip() 22 | text = text.replace("\n"," ") 23 | label_predict = model.predict(text) 24 | labels_predict.append(label_predict[0]) 25 | print ("文本: ",text[0:200]) 26 | print ("预测label: ",label_predict[0]) 
27 | print("-"*60) 28 | 29 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/accelerate_configs/fsdp_qlora.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: true 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: 'bf16' 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from typing import TYPE_CHECKING 17 | 18 | from ..import_utils import _LazyModule 19 | 20 | 21 | _import_structure = { 22 | "best_of_n_sampler": ["BestOfNSampler"], 23 | } 24 | 25 | if TYPE_CHECKING: 26 | from .best_of_n_sampler import BestOfNSampler 27 | else: 28 | import sys 29 | 30 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 31 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/dpo_pairwise_score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # gpt-3.5-turbo-0613 4 | # gpt-4-0613 5 | 6 | models=("dpo_ckpt_llama-70b_5e6_3epoch" "dpo_ckpt_llama-70b_5e6_6epoch") 7 | for model in ${models[*]} 8 | do 9 | for eval_file in 'frontis' 10 | do 11 | k1=sft-70b_final_95w_3epoch 12 | k2=$model 13 | scorer=gpt-4-0613 14 | 15 | echo "pairwise compare between ${model} and ${k1} on ${eval_file} ..." 
16 | python xllm/dpo_pairwise_score_by_gpt4.py \ 17 | -i1 ./evaluation/results/${k1}/${eval_file}/seed_3517.json \ 18 | -i2 ./evaluation/results/${k2}/${eval_file}/seed_3517.json \ 19 | -k1 $k1 \ 20 | -k2 $k2 \ 21 | --batch_size 10 \ 22 | --max_tokens 32 \ 23 | --output_dir ./ \ 24 | --eval_scorer $scorer 25 | 26 | python xllm/dpo_pairwise_score_by_gpt4.py \ 27 | -i1 ./evaluation/results/${k2}/${eval_file}/seed_3517.json \ 28 | -i2 ./evaluation/results/${k1}/${eval_file}/seed_3517.json \ 29 | -k1 $k2 \ 30 | -k2 $k1 \ 31 | --batch_size 10 \ 32 | --max_tokens 32 \ 33 | --output_dir ./ \ 34 | --eval_scorer $scorer 35 | done 36 | done 37 | 38 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2024 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # flake8: noqa 17 | 18 | from typing import TYPE_CHECKING 19 | from ..import_utils import _LazyModule, OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = { 23 | "cli_utils": ["SFTScriptArguments", "init_zero_verbose", "DPOScriptArguments", "TrlParser", "YamlConfigParser"], 24 | } 25 | 26 | 27 | if TYPE_CHECKING: 28 | from .cli_utils import SFTScriptArguments, init_zero_verbose, DPOScriptArguments, TrlParser, YamlConfigParser 29 | else: 30 | import sys 31 | 32 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 33 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/check_political_toxic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(r"..") 3 | from utils.util import load_set_from_txt 4 | _PolityToxic_ = "utils/political_toxic.txt" 5 | 6 | 7 | from flashtext import KeywordProcessor 8 | 9 | class CheckToxicWords(): 10 | def __init__(self): 11 | ''' 12 | self.political_words_set = load_set_from_txt(_PolityPersons_) 13 | self.sex_words = load_set_from_txt(_Sex_words_) 14 | self.ad_words = load_set_from_txt(_Ad_words_) 15 | ''' 16 | 17 | self.processor = KeywordProcessor() 18 | self.processor.add_keyword_from_file(_PolityToxic_) 19 | 20 | ''' 21 | aa = keyword_processor.extract_keywords('周杰伦是歌星在吉林大路开演唱会,导演国内有冯小刚,苏有朋演的是五阿哥,他现在居住在北京') 22 | print(aa) 23 | 运行结果: 24 | ['明星', '路名', '明星', '明星', '地名'] 25 | ''' 26 | 27 | def is_toxic_text(self,text,thresh_hold=1): 28 | res = self.processor.extract_keywords(text) 29 | # ["politician","badword","gumble","sex","ads","dirty"] 30 | if len(res) >= thresh_hold: return True,"_".join(res) 31 | return False,"" 32 | 33 | def checking_political_words(self,text): 34 | pass 35 | 36 | def checking_sex_words(self,text): 37 | pass 38 | 39 | def checking_ad_words(self,text): 40 | pass 41 | 42 | 43 | 44 | 
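The keyword screening in `check_political_toxic.py` above is driven entirely by the phrase list loaded into flashtext, so using it in a cleaning pass reduces to one call per document. A minimal usage sketch follows; it assumes the script is launched from the `DataCleaning` directory (so the relative `utils/political_toxic.txt` path resolves), and the sample strings are placeholders.

```python
# Hypothetical driver for CheckToxicWords; the input texts below are made-up placeholders.
from utils.check_political_toxic import CheckToxicWords

checker = CheckToxicWords()

docs = [
    "an ordinary paragraph about cooking",
    "a paragraph that happens to contain a blacklisted phrase",
]
for text in docs:
    flagged, hits = checker.is_toxic_text(text, thresh_hold=1)
    if flagged:
        # `hits` joins every matched keyword with "_", e.g. "politician_ads"
        print("drop:", text[:40], "| matched:", hits)
    else:
        print("keep:", text[:40])
```

Raising `thresh_hold` keeps documents that only graze the keyword list once, at the cost of letting more borderline text through.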
-------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/env_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Function `strtobool` copied and adapted from `distutils` (as deprected 16 | # in Python 3.10). 17 | # Reference: https://github.com/python/cpython/blob/48f9d3e3faec5faaa4f7c9849fecd27eae4da213/Lib/distutils/util.py#L308-L321 18 | 19 | 20 | def strtobool(val: str) -> bool: 21 | """Convert a string representation of truth to True or False booleans. 22 | 23 | True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values 24 | are 'n', 'no', 'f', 'false', 'off', and '0'. 25 | 26 | Raises: 27 | ValueError: if 'val' is anything else. 28 | """ 29 | val = val.lower() 30 | if val in ("y", "yes", "t", "true", "on", "1"): 31 | return True 32 | if val in ("n", "no", "f", "false", "off", "0"): 33 | return False 34 | raise ValueError(f"Invalid truth value, it should be a string but {val} was provided instead.") 35 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/hello_world.py: -------------------------------------------------------------------------------- 1 | # 0. imports 2 | import torch 3 | from transformers import GPT2Tokenizer 4 | 5 | from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer 6 | 7 | 8 | # 1. load a pretrained model 9 | model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2") 10 | model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2") 11 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 12 | tokenizer.pad_token = tokenizer.eos_token 13 | 14 | # 2. initialize trainer 15 | ppo_config = {"batch_size": 1} 16 | config = PPOConfig(**ppo_config) 17 | ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer) 18 | 19 | # 3. encode a query 20 | query_txt = "This morning I went to the " 21 | query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device) 22 | 23 | # 4. generate model response 24 | generation_kwargs = { 25 | "min_length": -1, 26 | "top_k": 0.0, 27 | "top_p": 1.0, 28 | "do_sample": True, 29 | "pad_token_id": tokenizer.eos_token_id, 30 | "max_new_tokens": 20, 31 | } 32 | response_tensor = ppo_trainer.generate([item for item in query_tensor], return_prompt=False, **generation_kwargs) 33 | response_txt = tokenizer.decode(response_tensor[0]) 34 | 35 | # 5. define a reward for response 36 | # (this could be any reward such as human feedback or output from another model) 37 | reward = [torch.tensor(1.0, device=model.pretrained_model.device)] 38 | 39 | # 6. 
train model with ppo 40 | train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward) 41 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/dpo_win_state.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import time 5 | from tqdm import tqdm 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | "--input_file", 11 | type=str, 12 | default="./dpo_ckpt_70b_32k_v6-sft-Llama3-70b_final_4epoch-frontis.json", 13 | help="" 14 | ) 15 | parser.add_argument( 16 | "--win_var", 17 | type=int, 18 | default=4, 19 | help="" 20 | ) 21 | args = parser.parse_args() 22 | 23 | js_dict_list = json.load(open(args.input_file)) 24 | total_win_var_num = 0 25 | total_loss_var_num = 0 26 | for idx,js_dict in tqdm(enumerate(js_dict_list),total=len(js_dict_list)): 27 | score_list = js_dict['score'] 28 | dpo_score = int(score_list[0]) 29 | sft_core = int(score_list[1]) 30 | 31 | if dpo_score >= 10 - args.win_var and sft_core <= args.win_var: 32 | total_win_var_num += 1 33 | if sft_core >= 10 - args.win_var and dpo_score <= args.win_var: 34 | total_loss_var_num += 1 35 | print("total numbers of DPO eval dataset: ", len(js_dict_list)) 36 | print("total significantly improved instances that below {} score: {}".format(args.win_var,total_win_var_num)) 37 | print("total significantly declined instances that below {} score: {}".format(args.win_var,total_loss_var_num)) 38 | print("significantly improved ratio: {}%".format(100.0*total_win_var_num/len(js_dict_list))) 39 | print("significantly declined ratio: {}%".format(100.0*total_loss_var_num/len(js_dict_list))) 40 | 41 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import unittest 15 | 16 | import torch 17 | 18 | from trl.core import masked_mean, masked_var, masked_whiten, whiten 19 | 20 | 21 | class CoreTester(unittest.TestCase): 22 | """ 23 | A wrapper class for testing core utils functions 24 | """ 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.test_input = torch.Tensor([1, 2, 3, 4]) 29 | cls.test_mask = torch.Tensor([0, 1, 1, 0]) 30 | cls.test_input_unmasked = cls.test_input[1:3] 31 | 32 | def test_masked_mean(self): 33 | self.assertEqual(torch.mean(self.test_input_unmasked), masked_mean(self.test_input, self.test_mask)) 34 | 35 | def test_masked_var(self): 36 | self.assertEqual(torch.var(self.test_input_unmasked), masked_var(self.test_input, self.test_mask)) 37 | 38 | def test_masked_whiten(self): 39 | whiten_unmasked = whiten(self.test_input_unmasked) 40 | whiten_masked = masked_whiten(self.test_input, self.test_mask)[1:3] 41 | diffs = (whiten_unmasked - whiten_masked).sum() 42 | self.assertAlmostEqual(diffs, 0) 43 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama_2/scripts/README.md: -------------------------------------------------------------------------------- 1 | # DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model 2 | 3 | ## Prerequisites 4 | 5 | Install all the dependencies in the `requirements.txt`: 6 | 7 | ``` 8 | $ pip install -U -r requirements.txt 9 | ``` 10 | 11 | Since we will use `accelerate` for training, make sure to run: 12 | ``` 13 | $ accelerate config 14 | ``` 15 | 16 | ## Training 17 | 18 | There were two main steps to the DPO training process: 19 | 1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: 20 | - `accelerate launch examples/stack_llama_2/scripts/sft_llama2.py --training_args.output_dir="sft"` 21 | 1. Run the DPO trainer using the model saved by the previous step: 22 | - `accelerate launch examples/stack_llama_2/scripts/dpo_llama2.py --model_name_or_path="sft/final_checkpoint" --output_dir="dpo"` 23 | 24 | 25 | ## Merging the adaptors 26 | 27 | To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: 28 | 29 | ``` 30 | python trl/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2" 31 | ``` 32 | 33 | which will also push the model to your HuggingFace hub account. 34 | 35 | ## Running the model 36 | 37 | We can load the DPO-trained LoRA adaptors which were saved by the DPO training step and load them via: 38 | 39 | ```py 40 | from peft import AutoPeftModelForCausalLM 41 | 42 | 43 | model = AutoPeftModelForCausalLM.from_pretrained( 44 | "dpo/final_checkpoint", 45 | low_cpu_mem_usage=True, 46 | torch_dtype=torch.float16, 47 | load_in_4bit=True, 48 | ) 49 | 50 | model.generate(...) 
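# note: the snippet above also needs `import torch`, since torch.float16 is passed as torch_dtype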
51 | ``` 52 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/run_data_cleaning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="cn-long_context-rewrite" 4 | clearning_version="v9" 5 | 6 | source_dir="/localdisk/llm/source_data/${dataset}" 7 | dest_dir="/localdisk/llm/clean_data/${dataset}/${clearning_version}" 8 | num_workers=32 9 | 10 | # Step1: Perform dataset cleaning 11 | python clean/wechat_clean.py \ 12 | --num_workers ${num_workers} \ 13 | --dataset_name ${dataset} \ 14 | --source_path ${source_dir} \ 15 | --dest_path ${dest_dir} 16 | if [ $? -ne 0 ]; then 17 | echo "${dataset}_clean.py failed." 18 | exit 19 | else 20 | echo "${dataset}_clean.py succeed." 21 | fi 22 | < --streaming --no_gradient_checkpointing --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se` 5 | 2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm: 6 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/stack_llama/scripts/reward_modeling.py --model_name=` 7 | 3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: 8 | - `accelerate launch --multi_gpu --num_machines 1 --num_processes 8 examples/stack_llama/scripts/rl_training.py --log_with=wandb --model_name= --reward_model_name= --adafactor=False --tokenizer_name= --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` 9 | 10 | 11 | LoRA layers were using at all stages to reduce memory requirements. 12 | At each stage the peft adapter layers were merged with the base model, using: 13 | ```shell 14 | python examples/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ 15 | ``` 16 | Note that this script requires `peft>=0.3.0`. 17 | 18 | For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). 19 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from huggingface_hub import PyTorchModelHubMixin 16 | 17 | 18 | class BaseTrainer(PyTorchModelHubMixin): 19 | r""" 20 | Base class for all trainers - this base class implements the basic functions that we 21 | need for a trainer. 
22 | 23 | The trainer needs to have the following functions: 24 | - step: takes in a batch of data and performs a step of training 25 | - loss: takes in a batch of data and returns the loss 26 | - compute_rewards: takes in a batch of data and returns the rewards 27 | - _build_models_and_tokenizer: builds the models and tokenizer 28 | - _build_dataset: builds the dataset 29 | Each user is expected to implement their own trainer class that inherits from this base 30 | if they want to use a new training algorithm. 31 | """ 32 | 33 | def __init__(self, config): 34 | self.config = config 35 | 36 | def step(self, *args): 37 | raise NotImplementedError("Not implemented") 38 | 39 | def loss(self, *args): 40 | raise NotImplementedError("Not implemented") 41 | 42 | def compute_rewards(self, *args): 43 | raise NotImplementedError("Not implemented") 44 | 45 | def _save_pretrained(self, save_directory): 46 | raise NotImplementedError("Not implemented") 47 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/opencc_t2s.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | from tqdm import tqdm 5 | import opencc 6 | import argparse 7 | from tqdm import tqdm 8 | from os import listdir, path 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--source_path', 13 | type=str, 14 | default="/data/data_warehouse/llm/llm-data-org.del/cn-wiki2", 15 | help='Directory containing trained actor model') 16 | parser.add_argument('--dest_path', 17 | type=str, 18 | default="/data/data_warehouse/llm/llm-data-org.del/", 19 | help='Directory containing trained actor model') 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def split_cn_wiki(args): 26 | files = sorted(listdir(args.source_path)) 27 | 28 | WikiDir = os.path.join(args.dest_path, "cn-wiki2_t2s") 29 | if not os.path.exists(WikiDir): 30 | os.makedirs(WikiDir, exist_ok=True) 31 | 32 | converter = opencc.OpenCC('t2s.json') 33 | 34 | for input_file in tqdm(files,total=len(files)): 35 | 36 | ifile = os.path.join(args.source_path,input_file) 37 | 38 | wiki_output_file = os.path.join(WikiDir,input_file) 39 | if os.path.exists(wiki_output_file): os.remove(wiki_output_file) 40 | wiki_fo = open(wiki_output_file, 'a+', encoding='utf-8') 41 | 42 | for line in open(ifile,'r',encoding="utf-8"): 43 | line = line.strip() 44 | if len(line) < 5:continue 45 | js_dict = json.loads(line) 46 | content = converter.convert(js_dict["content"]) 47 | js_dict["content"] = content 48 | jstr = json.dumps(js_dict, ensure_ascii=False) 49 | wiki_fo.write(jstr+"\n") 50 | wiki_fo.close() 51 | 52 | if __name__ == '__main__': 53 | 54 | args = parse_args() 55 | split_cn_wiki(args) 56 | 57 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ray_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ray 3 | import argparse 4 | 5 | def change_extension(filename, new_extension): 6 | base_name = os.path.splitext(filename)[0] # 获取文件名(不包含扩展名) 7 | new_filename = f"{base_name}.{new_extension}" # 构建新的文件名 8 | return new_filename 9 | 10 | def process_files_extension(folder_path, raw_extension, new_extension): 11 | for filename in os.listdir(folder_path): 12 | if filename.endswith(f".{raw_extension}"): 13 | old_path = os.path.join(folder_path, filename) 14 | new_filename = 
change_extension(filename, f"{new_extension}") # 修改后缀为"modified" 15 | new_path = os.path.join(folder_path, new_filename) 16 | os.rename(old_path, new_path) 17 | print(f"重命名文件:{filename} -> {new_filename}") 18 | 19 | def test_folder(dest_path): 20 | if not os.path.exists(dest_path): 21 | os.makedirs(dest_path, exist_ok=True) 22 | GoodDir = os.path.join(dest_path, "good") 23 | BadDir = os.path.join(dest_path, "bad") 24 | 25 | if not os.path.exists(GoodDir): 26 | os.makedirs(GoodDir, exist_ok=True) 27 | if not os.path.exists(BadDir): 28 | os.makedirs(BadDir,exist_ok=True) 29 | 30 | return GoodDir, BadDir 31 | 32 | def parse_args(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--source_path', 35 | type=str, 36 | default="/data/datacleansing/test", 37 | help='Directory containing trained actor model') 38 | parser.add_argument('--dest_path', 39 | type=str, 40 | default="/data/datacleansing/test_store", 41 | help='Directory containing trained actor model') 42 | parser.add_argument('--dataset_name', 43 | type=str, 44 | default="", 45 | help="") 46 | args = parser.parse_args() 47 | return args 48 | 49 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/reward_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from typing import Optional 17 | 18 | from transformers import TrainingArguments 19 | 20 | 21 | @dataclass 22 | class RewardConfig(TrainingArguments): 23 | """ 24 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class. 25 | 26 | Using [`HfArgumentParser`] we can turn this class into 27 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 28 | command line. 29 | 30 | Parameters: 31 | max_length (`int`, *optional*, defaults to `None`): 32 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 33 | gradient_checkpointing (`bool`, *optional*, defaults to `True`): 34 | If True, use gradient checkpointing to save memory at the expense of slower backward pass. 35 | """ 36 | 37 | max_length: Optional[int] = None 38 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" 39 | dataset_num_proc: Optional[int] = None 40 | """Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). 
Recommended value: `0.01`.""" 41 | center_rewards_coefficient: Optional[float] = None 42 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/training_configs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # coding=utf-8 3 | # Copyright 2023 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from dataclasses import dataclass 17 | from typing import Optional 18 | 19 | from transformers import TrainingArguments 20 | 21 | 22 | @dataclass 23 | class RewardConfig(TrainingArguments): 24 | """ 25 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class. 26 | 27 | Using [`HfArgumentParser`] we can turn this class into 28 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 29 | command line. 30 | 31 | Parameters: 32 | max_length (`int`, *optional*, defaults to `None`): 33 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 34 | gradient_checkpointing (`bool`, *optional*, defaults to `True`): 35 | If True, use gradient checkpointing to save memory at the expense of slower backward pass. 36 | """ 37 | 38 | max_length: Optional[int] = None 39 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" 40 | gradient_checkpointing: Optional[bool] = True 41 | """If True, use gradient checkpointing to save memory at the expense of slower backward pass.""" 42 | gradient_checkpointing_kwargs: Optional[dict] = None 43 | """Keyword arguments to pass to the gradient checkpointing function.""" 44 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | import torch 5 | from peft import PeftConfig, PeftModel 6 | from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser 7 | 8 | 9 | @dataclass 10 | class ScriptArguments: 11 | """ 12 | The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the 13 | merged model. 
14 | """ 15 | 16 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) 17 | base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) 18 | output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) 19 | 20 | 21 | parser = HfArgumentParser(ScriptArguments) 22 | script_args = parser.parse_args_into_dataclasses()[0] 23 | assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" 24 | assert script_args.base_model_name is not None, "please provide the name of the Base model" 25 | assert script_args.output_name is not None, "please provide the output name of the merged model" 26 | 27 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 28 | if peft_config.task_type == "SEQ_CLS": 29 | # The sequence classification task is used for the reward model in PPO 30 | model = AutoModelForSequenceClassification.from_pretrained( 31 | script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 32 | ) 33 | else: 34 | model = AutoModelForCausalLM.from_pretrained( 35 | script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 36 | ) 37 | 38 | tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) 39 | 40 | # Load the PEFT model 41 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name) 42 | model.eval() 43 | 44 | model = model.merge_and_unload() 45 | 46 | model.save_pretrained(f"{script_args.output_name}") 47 | tokenizer.save_pretrained(f"{script_args.output_name}") 48 | model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) 49 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # flake8: noqa 17 | 18 | from typing import TYPE_CHECKING 19 | from ..import_utils import _LazyModule, is_diffusers_available, OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = { 23 | "modeling_base": ["PreTrainedModelWrapper", "create_reference_model"], 24 | "modeling_value_head": [ 25 | "AutoModelForCausalLMWithValueHead", 26 | "AutoModelForSeq2SeqLMWithValueHead", 27 | ], 28 | "utils": ["setup_chat_format", "SUPPORTED_ARCHITECTURES", "unwrap_model_for_generation"], 29 | } 30 | 31 | try: 32 | if not is_diffusers_available(): 33 | raise OptionalDependencyNotAvailable() 34 | except OptionalDependencyNotAvailable: 35 | pass 36 | else: 37 | _import_structure["modeling_sd_base"] = [ 38 | "DDPOPipelineOutput", 39 | "DDPOSchedulerOutput", 40 | "DDPOStableDiffusionPipeline", 41 | "DefaultDDPOStableDiffusionPipeline", 42 | ] 43 | 44 | if TYPE_CHECKING: 45 | from .modeling_base import PreTrainedModelWrapper, create_reference_model 46 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead 47 | from .utils import setup_chat_format, SUPPORTED_ARCHITECTURES 48 | 49 | try: 50 | if not is_diffusers_available(): 51 | raise OptionalDependencyNotAvailable() 52 | except OptionalDependencyNotAvailable: 53 | pass 54 | else: 55 | from .modeling_sd_base import ( 56 | DDPOPipelineOutput, 57 | DDPOSchedulerOutput, 58 | DDPOStableDiffusionPipeline, 59 | DefaultDDPOStableDiffusionPipeline, 60 | ) 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 65 | 
-------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/split_bigdata_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def split(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 14 | if os.path.exists(dest_file): os.remove(dest_file) 15 | of = open(dest_file,'w',encoding='utf-8') 16 | 17 | subsets = sorted(listdir(args.source_path)) 18 | for dir_no,file_name in tqdm(enumerate(subsets),total=len(subsets)): 19 | 20 | input_file = os.path.join(args.source_path,file_name) 21 | with open(input_file, 'r',encoding='utf-8') as f: 22 | for line in f: 23 | line = line.strip() 24 | if len(line) < 1:continue 25 | js_dict = json.loads(line) 26 | #js_dict["id"] = js_dict["note_id"] 27 | #del js_dict["note_id"] 28 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 29 | if of.tell() > args.max_size: 30 | of.close() 31 | global_file_no += 1  # advance the shard index before opening the next part file (otherwise part-000000 gets overwritten on the first rollover) 32 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 33 | if os.path.exists(dest_file): os.remove(dest_file) 34 | of = open(dest_file,'w',encoding='utf-8') 35 | of.close() 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('--source_path', 41 | type=str, 42 | default="/data/tianqingxiang/data/llm/ocr/ocr_infer_result/200W", 43 | help='Directory containing the source jsonl files') 44 | parser.add_argument('--dest_path', 45 | type=str, 46 | default="/root/llm/source_data/cn-JD-ocrtext", 47 | help='Directory to write the split jsonl chunks') 48 | parser.add_argument('--dataset_name', 49 | type=str, 50 | default="cn-JD-ocrtext", 51 | help="") 52 |
parser.add_argument('--max_size', 53 | type=int, 54 | default=200*1024*1024, 55 | help="max chunk size") 56 | args = parser.parse_args() 57 | return args 58 | 59 | if __name__ == "__main__": 60 | args = parse_args() 61 | 62 | if not os.path.exists(args.dest_path): 63 | os.makedirs(args.dest_path, exist_ok=True) 64 | split(args) 65 | 66 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/commands/cli.py: -------------------------------------------------------------------------------- 1 | # This file is a copy of trl/examples/scripts/sft.py so that we could 2 | # use it together with rich and the TRL CLI in a more customizable manner. 3 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import os 17 | import subprocess 18 | import sys 19 | from subprocess import CalledProcessError 20 | 21 | from rich.console import Console 22 | 23 | 24 | SUPPORTED_COMMANDS = ["sft", "dpo", "chat"] 25 | 26 | 27 | def main(): 28 | console = Console() 29 | # Make sure to import things locally to avoid verbose from third party libs. 30 | with console.status("[bold purple]Welcome! Initializing the TRL CLI..."): 31 | from trl.commands.cli_utils import init_zero_verbose 32 | 33 | init_zero_verbose() 34 | 35 | command_name = sys.argv[1] 36 | 37 | if command_name not in SUPPORTED_COMMANDS: 38 | raise ValueError( 39 | f"Please use one of the supported commands, got {command_name} - supported commands are {SUPPORTED_COMMANDS}" 40 | ) 41 | 42 | trl_examples_dir = os.path.dirname(__file__) 43 | 44 | # Force-use rich if the `TRL_USE_RICH` env var is not set 45 | if "TRL_USE_RICH" not in os.environ: 46 | os.environ["TRL_USE_RICH"] = "1" 47 | 48 | if command_name == "chat": 49 | command = f""" 50 | python {trl_examples_dir}/scripts/{command_name}.py {" ".join(sys.argv[2:])} 51 | """ 52 | else: 53 | command = f""" 54 | accelerate launch {trl_examples_dir}/scripts/{command_name}.py {" ".join(sys.argv[2:])} 55 | """ 56 | 57 | try: 58 | subprocess.run( 59 | command.split(), 60 | text=True, 61 | check=True, 62 | encoding="utf-8", 63 | cwd=os.getcwd(), 64 | env=os.environ.copy(), 65 | ) 66 | except (CalledProcessError, ChildProcessError) as exc: 67 | console.log(f"TRL - {command_name.upper()} failed on ! See the logs above for further details.") 68 | raise ValueError("TRL CLI failed! 
Check the traceback above..") from exc 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/random_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | 13 | def random_sample_benchmark(sample_num,input_dir): 14 | 15 | files = sorted(glob.glob(os.path.join(input_dir,"*.jsonl"), recursive=True)) 16 | 17 | avg_nums_per_task = math.ceil(sample_num/len(files)) 18 | 19 | sample_1000 = [] 20 | for file in tqdm(files,total=len(files)): 21 | filename = os.path.basename(file).replace(".jsonl","") 22 | data = [] 23 | for line in open(file,"r",encoding='utf-8'): 24 | try: 25 | js = json.loads(line.strip()) 26 | except json.decoder.JSONDecodeError: 27 | print(line) 28 | data.append(js) 29 | random.shuffle(data) 30 | print(f"process file {filename}, total of {len(data)}.") 31 | sample_1000.extend(data[0:avg_nums_per_task]) 32 | return sample_1000 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--dataset_name', 37 | type=str, 38 | default="xhs", 39 | help='dataset name') 40 | parser.add_argument('--dataset_path', 41 | type=str, 42 | default="/yuan1.0/open_source_1T", 43 | help='source path') 44 | parser.add_argument('--output_path', 45 | type=str, 46 | default="./", 47 | help='source path') 48 | 49 | parser.add_argument("--number_sample", 50 | type=int, 51 | default=100, 52 | help="number of sampled data" 53 | ) 54 | parser.add_argument('--version', 55 | type=str, 56 | default="v1", 57 | help="" 58 | ) 59 | args = parser.parse_args() 60 | return args 61 | 62 | if __name__ == '__main__': 63 | 64 | args = parse_args() 65 | 66 | sample_benchmark = [] 67 | 68 | sample_benchmark = random_sample_benchmark( 69 | sample_num=args.number_sample, 70 | input_dir=args.dataset_path 71 | ) 72 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 73 | 74 | output_file=f"{args.output_path}/{args.dataset_name}-sample{args.number_sample}-{args.version}.jsonl" 75 | if os.path.exists(output_file): os.remove(output_file) 76 | fo = open(output_file, 'w', encoding='utf-8') 77 | 78 | for idx, item in tqdm(enumerate(sample_benchmark),total=len(sample_benchmark)): 79 | item["id"] = idx + 1 80 | jstr = json.dumps(item, ensure_ascii=False) 81 | fo.write(jstr+"\n") 82 | fo.close() 83 | print(f"Output file {output_file}, total sampled {len(sample_benchmark)}") 84 | 85 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-baidu_weixin-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | def make_clean(args): 10 | 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | 15 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 16 | if os.path.exists(dest_file): os.remove(dest_file) 17 | global_file_no += 1 18 | of = open(dest_file,'w',encoding='utf-8') 19 | 20 | subfiles = sorted(listdir(args.source_path)) 21 | for dir_no,subfile in tqdm(enumerate(subfiles),total=len(subfiles)): 22 | 23 | input_file = 
os.path.join(args.source_path,subfile) 24 | 25 | with open(input_file, 'r') as f: 26 | datalist = f.readlines() 27 | 28 | for line in datalist: 29 | line = line.strip() 30 | if len(line) < 1: 31 | continue 32 | 33 | js_data = json.loads(line) 34 | js_dict = {} 35 | js_dict["id"] = global_id_no 36 | js_dict["source"] = "cn-baidu-weixin" 37 | js_dict["source_id"] = js_data['url'] 38 | js_dict["subset"] = js_data["search_keyword"] 39 | js_dict["content"] = js_data["content"] 40 | global_id_no += 1 41 | 42 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 43 | if of.tell() > args.max_size: 44 | of.close() 45 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 46 | if os.path.exists(dest_file): os.remove(dest_file) 47 | of = open(dest_file,'w',encoding='utf-8') 48 | global_file_no += 1 49 | of.close() 50 | 51 | def parse_args(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--source_path', 54 | type=str, 55 | default="/data/data_warehouse/SourceData/baidu_weixin/231027/", 56 | help='Directory containing trained actor model') 57 | parser.add_argument('--dest_path', 58 | type=str, 59 | default="/localdisk/llm/source_data/cn-baidu-weixin", 60 | help='Directory containing trained actor model') 61 | parser.add_argument('--dataset_name', 62 | type=str, 63 | default="cn-baidu-weixin", 64 | help="") 65 | parser.add_argument('--max_size', 66 | type=int, 67 | default=200 * 1024 * 1024, 68 | help="max chunk size") 69 | args = parser.parse_args() 70 | return args 71 | 72 | if __name__ == "__main__": 73 | args = parse_args() 74 | 75 | if not os.path.exists(args.dest_path): 76 | os.makedirs(args.dest_path, exist_ok=True) 77 | make_clean(args) 78 | 79 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/scripts/postrain_with_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | log_out=0 5 | only_print=0 6 | dist_only_print=0 7 | enable_flash_attn="True" 8 | tie_embed="False" 9 | 10 | datestr=`date +"%Y-%m-%d"` 11 | wandb_run_name="dpo-sftExp8.3-Qwen1.5-14B-cp1006-$datestr" 12 | 13 | output_path=/mnt/lptest/sftExp8.3-Qwen1.5-14B-checkpoint-1006-post 14 | ckpt_path=/mnt/lptest/sftExp8.3-Qwen1.5-14B/sftExp8.3-Qwen1.5-14B-checkpoint-1006 15 | model_type="Qwen" 16 | 17 | pd_token="3xxxx" 18 | data_suffix="*.jsonl" 19 | train_data_path=/mnt/lptest/xubu/dpo_dataGen/dpo_preference_data/train 20 | dev_data_path=/mnt/lptest/xubu/dpo_dataGen/dpo_preference_data/dev 21 | 22 | num_processes=32 23 | beta=0.1 24 | bs_per_dev=2 25 | grad_acc_steps=2 26 | 27 | # save model per 500 global_step (2B token, 3h) 28 | ckpt_steps=281 29 | eval_steps=281 30 | 31 | # Direct Preference Optimization 32 | train_epoch=3 33 | lr=5e-6 34 | warmup_ratio=0.02 35 | 36 | max_length=2048 37 | max_prompt_length=1024 38 | max_target_length=1024 39 | 40 | strategy=zero3 41 | sanity_check=False 42 | 43 | # Run Command 44 | REPO=$(pwd) 45 | config=$REPO/scripts/accelerate_configs/deepspeed_${strategy}.yaml 46 | echo "config file: $config" 47 | 48 | CMD="" 49 | 50 | CMD="$CMD PYTHONPATH=$REPO" 51 | CMD="$CMD accelerate launch" 52 | 53 | CMD="$CMD --num_processes=$num_processes --config_file=$config" 54 | CMD="$CMD $REPO/xllm/postrain.py" 55 | 56 | CMD="$CMD --bf16 --beta $beta --model_name_or_path $ckpt_path --learning_rate $lr --model_architecture_type $model_type" 57 | CMD="$CMD --per_device_train_batch_size $bs_per_dev --gradient_accumulation_steps 
$grad_acc_steps" 58 | 59 | CMD="$CMD --max_length $max_length --max_prompt_length $max_prompt_length --max_target_length $max_target_length" 60 | CMD="$CMD --sanity_check $sanity_check --report_to 'wandb' --run_name $wandb_run_name" 61 | 62 | # --tie_word_embeddings $tie_embed --enable_flash_attn $enable_flash_attn 63 | CMD="$CMD --ignore_bias_buffers False --logging_steps 1" 64 | # --tie_word_embeddings $tie_embed --enable_flash_attn $enable_flash_attn 65 | 66 | CMD="$CMD --train_dataset_path $train_data_path --test_dataset_path $dev_data_path --data_suffix $data_suffix" 67 | 68 | CMD="$CMD --eval_steps $eval_steps --num_train_epochs $train_epoch --warmup_ratio $warmup_ratio" 69 | 70 | CMD="$CMD --output_dir $output_path --save_steps $ckpt_steps --logging_first_step --no_remove_unused_columns" 71 | 72 | CMD="$CMD --gradient_checkpointing True --weight_decay 0.1 --max_grad_norm 1.0" 73 | 74 | 75 | echo $CMD 76 | printf "===== Running Command =====\n" 77 | printf "\t%s\n\n" "$CMD" 78 | 79 | if [[ $only_print == "0" ]]; then 80 | printf "===== Command Logs =====\n" 81 | if [[ $log_out == "1" ]]; then 82 | echo "Command is running...." 83 | echo "Please run [tail -f ${log_file}] in another shell to monitoring the running process." 84 | fi 85 | if [[ -d $REPO/logs ]]; then 86 | timestamp=$(date +"%Y%m%d.%H.%M.%S") 87 | mv $REPO/logs $REPO/logs.$timestamp 88 | fi 89 | mkdir $REPO/logs 90 | eval "$CMD" 91 | fi 92 | 93 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-e-txt-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | def make_clean(args): 10 | 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 15 | if os.path.exists(dest_file): os.remove(dest_file) 16 | global_file_no += 1 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(subfiles),total=len(subfiles)): 21 | 22 | input_file = os.path.join(args.source_path,subfile) 23 | if not (input_file.endswith(".txt") or input_file.endswith(".shtml")): continue 24 | 25 | html_str = open(input_file, 'rb').read() 26 | encoding_info = chardet.detect(html_str) 27 | original_encoding = encoding_info['encoding'] 28 | if original_encoding not in ["UTF-8","GB2312","GB18030","Big5","utf-8","UTF-16","UTF-32"]: continue 29 | 30 | html_str = html_str.decode(original_encoding, 'ignore')#.encode('utf-8') 31 | if len(html_str) < 256: continue 32 | 33 | js_dict = {} 34 | js_dict["id"] = global_id_no 35 | js_dict["source"] = "cn-e-txt" 36 | js_dict["subset"] = os.path.basename(subfile).replace(".txt","") 37 | js_dict["source_id"] = "" 38 | global_id_no += 1 39 | 40 | js_dict["content"] = html_str 41 | 42 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 43 | if of.tell() > args.max_size: 44 | of.close() 45 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 46 | if os.path.exists(dest_file): os.remove(dest_file) 47 | of = open(dest_file,'w',encoding='utf-8') 48 | global_file_no += 1 49 | of.close() 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--source_path', 55 | type=str, 56 | 
default="/data/data_warehouse/SourceData/txt", 57 | help='Directory containing trained actor model') 58 | parser.add_argument('--dest_path', 59 | type=str, 60 | default="/localdisk/llm/source_data/cn-e-txt", 61 | help='Directory containing trained actor model') 62 | parser.add_argument('--dataset_name', 63 | type=str, 64 | default="cn-e-txt", 65 | help="") 66 | parser.add_argument('--max_size', 67 | type=int, 68 | default=200 * 1024 * 1024, 69 | help="max chunk size") 70 | args = parser.parse_args() 71 | return args 72 | 73 | if __name__ == "__main__": 74 | args = parse_args() 75 | 76 | if not os.path.exists(args.dest_path): 77 | os.makedirs(args.dest_path, exist_ok=True) 78 | make_clean(args) 79 | 80 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import unittest 15 | 16 | import torch 17 | 18 | from trl import is_diffusers_available, is_peft_available, is_wandb_available, is_xpu_available 19 | 20 | 21 | def require_peft(test_case): 22 | """ 23 | Decorator marking a test that requires peft. Skips the test if peft is not available. 24 | """ 25 | if not is_peft_available(): 26 | test_case = unittest.skip("test requires peft")(test_case) 27 | return test_case 28 | 29 | 30 | def require_diffusers(test_case): 31 | """ 32 | Decorator marking a test that requires diffusers. Skips the test if diffusers is not available. 33 | """ 34 | if not is_diffusers_available(): 35 | test_case = unittest.skip("test requires diffusers")(test_case) 36 | return test_case 37 | 38 | 39 | def require_wandb(test_case, required: bool = True): 40 | """ 41 | Decorator marking a test that requires wandb. Skips the test if wandb is not available. 42 | """ 43 | # XOR, i.e.: 44 | # skip if available and required = False and 45 | # skip if not available and required = True 46 | if is_wandb_available() ^ required: 47 | test_case = unittest.skip("test requires wandb")(test_case) 48 | return test_case 49 | 50 | 51 | def require_no_wandb(test_case): 52 | """ 53 | Decorator marking a test that requires no wandb. Skips the test if wandb is available. 54 | """ 55 | return require_wandb(test_case, required=False) 56 | 57 | 58 | def require_bitsandbytes(test_case): 59 | """ 60 | Decorator marking a test that requires bitsandbytes. Skips the test if bitsandbytes is not available. 61 | """ 62 | try: 63 | import bitsandbytes # noqa: F401 64 | except ImportError: 65 | test_case = unittest.skip("test requires bitsandbytes")(test_case) 66 | return test_case 67 | 68 | 69 | def require_torch_multi_gpu(test_case): 70 | """ 71 | Decorator marking a test that requires multiple GPUs. Skips the test if there aren't enough GPUs. 
72 | """ 73 | if torch.cuda.device_count() < 2: 74 | test_case = unittest.skip("test requires multiple GPUs")(test_case) 75 | return test_case 76 | 77 | 78 | def require_torch_multi_xpu(test_case): 79 | """ 80 | Decorator marking a test that requires multiple XPUs. Skips the test if there aren't enough XPUs. 81 | """ 82 | if torch.xpu.device_count() < 2 and is_xpu_available(): 83 | test_case = unittest.skip("test requires multiple XPUs")(test_case) 84 | return test_case 85 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/requirements/pip_dpo_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.28.0 2 | aiohappyeyeballs==2.3.5 3 | aiohttp==3.10.3 4 | aiosignal==1.3.1 5 | annotated-types==0.7.0 6 | anyio==4.4.0 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | cachetools==5.4.0 10 | certifi==2024.7.4 11 | charset-normalizer==3.3.2 12 | click==8.1.7 13 | cloudpickle==3.0.0 14 | cmake==3.30.2 15 | datasets==2.21.0 16 | deepspeed==0.14.5 17 | dill==0.3.8 18 | diskcache==5.6.3 19 | distro==1.9.0 20 | docker-pycreds==0.4.0 21 | docstring_parser==0.16 22 | einops==0.8.0 23 | exceptiongroup==1.2.2 24 | fastapi==0.112.0 25 | fastchat==0.1.0 26 | filelock==3.15.4 27 | flash_attn==2.6.3 28 | frozenlist==1.4.1 29 | fsspec==2024.6.1 30 | gitdb==4.0.11 31 | GitPython==3.1.43 32 | h11==0.14.0 33 | hjson==3.1.0 34 | httpcore==1.0.5 35 | httptools==0.6.1 36 | httpx==0.27.0 37 | huggingface-hub==0.24.5 38 | idna==3.7 39 | interegular==0.3.3 40 | Jinja2==3.1.4 41 | jiter==0.5.0 42 | joblib==1.4.2 43 | jsonschema==4.23.0 44 | jsonschema-specifications==2023.12.1 45 | lark==1.2.2 46 | llvmlite==0.43.0 47 | lm-format-enforcer==0.10.1 48 | loguru==0.7.2 49 | markdown-it-py==3.0.0 50 | MarkupSafe==2.1.5 51 | mdurl==0.1.2 52 | mpmath==1.3.0 53 | msgpack==1.0.8 54 | multidict==6.0.5 55 | multiprocess==0.70.16 56 | nest-asyncio==1.6.0 57 | networkx==3.3 58 | ninja==1.11.1.1 59 | numba==0.60.0 60 | numpy==1.26.4 61 | nvidia-cublas-cu12==12.1.3.1 62 | nvidia-cuda-cupti-cu12==12.1.105 63 | nvidia-cuda-nvrtc-cu12==12.1.105 64 | nvidia-cuda-runtime-cu12==12.1.105 65 | nvidia-cudnn-cu12==8.9.2.26 66 | nvidia-cufft-cu12==11.0.2.54 67 | nvidia-curand-cu12==10.3.2.106 68 | nvidia-cusolver-cu12==11.4.5.107 69 | nvidia-cusparse-cu12==12.1.0.106 70 | nvidia-ml-py==12.535.161 71 | nvidia-nccl-cu12==2.20.5 72 | nvidia-nvjitlink-cu12==12.6.20 73 | nvidia-nvtx-cu12==12.1.105 74 | nvitop==1.3.2 75 | openai==1.40.6 76 | outlines==0.0.34 77 | packaging==24.1 78 | pandas==2.2.2 79 | platformdirs==4.2.2 80 | prometheus-fastapi-instrumentator==7.0.0 81 | prometheus_client==0.20.0 82 | protobuf==5.27.3 83 | psutil==6.0.0 84 | py-cpuinfo==9.0.0 85 | pyarrow==17.0.0 86 | pycryptodome==3.20.0 87 | pydantic==2.8.2 88 | pydantic_core==2.20.1 89 | Pygments==2.18.0 90 | python-dateutil==2.9.0.post0 91 | python-dotenv==1.0.1 92 | pytz==2024.1 93 | PyYAML==6.0.2 94 | ray==2.34.0 95 | referencing==0.35.1 96 | regex==2024.7.24 97 | requests==2.32.3 98 | rich==13.7.1 99 | rpds-py==0.20.0 100 | safetensors==0.4.4 101 | scipy==1.14.0 102 | sentencepiece==0.2.0 103 | sentry-sdk==2.13.0 104 | setproctitle==1.3.3 105 | shtab==1.7.1 106 | six==1.16.0 107 | smmap==5.0.1 108 | sniffio==1.3.1 109 | starlette==0.37.2 110 | sympy==1.13.2 111 | termcolor==2.4.0 112 | tiktoken==0.7.0 113 | tokenizers==0.15.2 114 | torch==2.3.0 115 | tqdm==4.66.5 116 | transformers==4.38.2 117 | triton==2.3.0 118 | trl==0.9.6 119 | typing_extensions==4.12.2 
120 | tyro==0.8.8 121 | tzdata==2024.1 122 | urllib3==2.2.2 123 | uvicorn==0.30.6 124 | uvloop==0.19.0 125 | vllm==0.4.3 126 | vllm-flash-attn==2.5.8.post2 127 | wandb==0.17.7 128 | watchfiles==0.23.0 129 | websockets==12.0 130 | xformers==0.0.26.post1 131 | xxhash==3.5.0 132 | yarl==1.9.4 133 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/pretrain_data_sampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | def random_sample_benchmark(sample_num,input_dir): 13 | 14 | files = sorted(glob.glob(os.path.join(input_dir,"v*"), recursive=True)) 15 | good_dir = os.path.join(input_dir,files[-1],"good") 16 | 17 | input_files = sorted(glob.glob(os.path.join(good_dir,"*.jsonl"), recursive=True)) 18 | avg_nums_per_task = math.ceil(sample_num/len(input_files)) 19 | 20 | sample_1000 = [] 21 | for file in tqdm(input_files,total=len(input_files)): 22 | filename = os.path.basename(file).replace(".jsonl","") 23 | data = [] 24 | for line in open(file,"r",encoding='utf-8'): 25 | try: 26 | js = json.loads(line.strip()) 27 | except json.decoder.JSONDecodeError: 28 | print(line) 29 | data.append(js) 30 | random.shuffle(data) 31 | print(f"process file {filename}, total of {len(data)}.") 32 | sample_1000.extend(data[0:avg_nums_per_task]) 33 | return sample_1000 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dataset_name', 38 | type=str, 39 | default="SafetyCheck", 40 | help='dataset name') 41 | parser.add_argument('--dataset_path', 42 | type=str, 43 | default="/data/data_warehouse/llm/clean_data/", 44 | help='source path') 45 | parser.add_argument('--output_path', 46 | type=str, 47 | default="./", 48 | help='source path') 49 | 50 | parser.add_argument("--number_sample", 51 | type=int, 52 | default=100000, 53 | help="number of sampled data" 54 | ) 55 | parser.add_argument('--version', 56 | type=str, 57 | default="v1", 58 | help="" 59 | ) 60 | args = parser.parse_args() 61 | return args 62 | 63 | if __name__ == '__main__': 64 | 65 | args = parse_args() 66 | sample_benchmark = [] 67 | 68 | subdirs = sorted(glob.glob(os.path.join(args.dataset_path,"cn-*"), recursive=True)) 69 | print(f"{subdirs}") 70 | for subdir in subdirs: 71 | dir_ = os.path.join(args.dataset_path,subdir) 72 | print(f"Processing {dir_}...") 73 | samples = random_sample_benchmark( 74 | sample_num=args.number_sample, 75 | input_dir=dir_ 76 | ) 77 | sample_benchmark.extend(samples) 78 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 79 | 80 | output_file=f"{args.output_path}/{args.dataset_name}-sample-2k.jsonl" 81 | if os.path.exists(output_file): os.remove(output_file) 82 | fo = open(output_file, 'w', encoding='utf-8') 83 | 84 | random.shuffle(sample_benchmark) 85 | for idx, item in tqdm(enumerate(sample_benchmark),total=len(sample_benchmark)): 86 | item["id"] = idx + 1 87 | jstr = json.dumps(item, ensure_ascii=False) 88 | fo.write(jstr+"\n") 89 | fo.close() 90 | print(f"Output file {output_file}, total sampled {len(samples)}") 91 | 92 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/search_pretraindata-checkpoint.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def make_clean(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | 14 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 15 | if os.path.exists(dest_file): os.remove(dest_file) 16 | global_file_no += 1 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subsets = sorted(listdir(args.source_path)) 20 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 21 | 22 | if subset_dir.find("cn-") == -1: continue 23 | 24 | file_dir = os.path.join(args.source_path,subset_dir) 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | for file in files: 29 | if not file.endswith(".jsonl"):continue 30 | input_file = os.path.join(root,file) 31 | print("input_file:",input_file) 32 | with open(input_file, 'r',encoding='utf-8') as f: 33 | for line in f: 34 | js_dict = json.loads(line) 35 | 36 | content = js_dict["content"] 37 | if content.find("时代在召唤") == -1 and content.find("长城Assistant") == -1: continue  # keep only documents that contain at least one of the two marker phrases 38 | 39 | if content.find("时代在召唤") >= 0: 40 | js_dict["datatype"] = "时代在召唤" 41 | elif content.find("长城Assistant") >= 0: 42 | js_dict["datatype"] = "长城Assistant" 43 | 44 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 45 | if of.tell() > args.max_size: 46 | of.close() 47 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 48 | if os.path.exists(dest_file): os.remove(dest_file) 49 | of = open(dest_file,'w',encoding='utf-8') 50 | global_file_no += 1 51 | of.close() 52 | 53 | 54 | def parse_args(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--source_path', 57 | type=str, 58 | default="/llm-data-org.del/", 59 | help='source data directory') 60 | parser.add_argument('--dest_path', 61 | type=str, 62 | default="/localdisk/datacleaner/preprocess/", 63 | help='destination directory for the extracted jsonl chunks') 64 | parser.add_argument('--dataset_name', 65 | type=str, 66 | default="cn-mnbvc", 67 | help="") 68 | parser.add_argument('--max_size', 69 | type=int, 70 | default=200 * 1024 * 1024, 71 | help="max chunk size") 72 | args = parser.parse_args() 73 | return args 74 | 75 | if __name__ == "__main__": 76 | args = parse_args() 77 | 78 | if not os.path.exists(args.dest_path): 79 | os.makedirs(args.dest_path, exist_ok=True) 80 | make_clean(args) 81 | 82 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/safetycheck_random_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import json 7 | import argparse 8 | import random 9 | from tqdm import tqdm 10 | import hashlib 11 | 12 | def random_sample_benchmark(sample_num,input_dir): 13 | 14 | files = sorted(glob.glob(os.path.join(input_dir,"v*"), recursive=True)) 15 | good_dir = os.path.join(input_dir,files[-1],"good") 16 | 17 | input_files = sorted(glob.glob(os.path.join(good_dir,"*.jsonl"), recursive=True)) 18 | avg_nums_per_task = math.ceil(sample_num/len(input_files)) 19 | 20 | sample_1000 = [] 21 | for file in tqdm(input_files,total=len(input_files)): 22 | filename = os.path.basename(file).replace(".jsonl","") 23 | data = [] 24 | for line in
open(file,"r",encoding='utf-8'): 25 | try: 26 | js = json.loads(line.strip()) 27 | except json.decoder.JSONDecodeError: 28 | print(line); continue  # skip lines that fail to parse instead of re-appending the previous record 29 | data.append(js) 30 | random.shuffle(data) 31 | print(f"process file {filename}, total of {len(data)}.") 32 | sample_1000.extend(data[0:avg_nums_per_task]) 33 | return sample_1000 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dataset_name', 38 | type=str, 39 | default="SafetyCheck", 40 | help='dataset name') 41 | parser.add_argument('--dataset_path', 42 | type=str, 43 | default="/data/data_warehouse/llm/clean_data/", 44 | help='source path') 45 | parser.add_argument('--output_path', 46 | type=str, 47 | default="./", 48 | help='output path') 49 | 50 | parser.add_argument("--number_sample", 51 | type=int, 52 | default=100000, 53 | help="number of sampled data" 54 | ) 55 | parser.add_argument('--version', 56 | type=str, 57 | default="v1", 58 | help="" 59 | ) 60 | args = parser.parse_args() 61 | return args 62 | 63 | if __name__ == '__main__': 64 | 65 | args = parse_args() 66 | sample_benchmark = [] 67 | 68 | subdirs = sorted(glob.glob(os.path.join(args.dataset_path,"cn-*"), recursive=True)) 69 | print(f"{subdirs}") 70 | for subdir in subdirs: 71 | dir_ = os.path.join(args.dataset_path,subdir) 72 | print(f"Processing {dir_}...") 73 | samples = random_sample_benchmark( 74 | sample_num=args.number_sample, 75 | input_dir=dir_ 76 | ) 77 | sample_benchmark.extend(samples) 78 | print(f"sampling benchmark questions: {len(sample_benchmark)}") 79 | 80 | output_file=f"{args.output_path}/{args.dataset_name}-sample-2k.jsonl" 81 | if os.path.exists(output_file): os.remove(output_file) 82 | fo = open(output_file, 'w', encoding='utf-8') 83 | 84 | random.shuffle(sample_benchmark) 85 | samples = sample_benchmark[0:2000] 86 | 87 | for idx, item in tqdm(enumerate(samples),total=len(samples)): 88 | item["id"] = idx + 1 89 | jstr = json.dumps(item, ensure_ascii=False) 90 | fo.write(jstr+"\n") 91 | fo.close() 92 | print(f"Output file {output_file}, total sampled {len(samples)}") 93 | 94 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/clean_headtails_from_content.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | import os 3 | import sys 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 5 | from flashtext import KeywordProcessor 6 | from utils.util import load_list_from_structedTxt 7 | 8 | class CleanHeadTailsFromContent: 9 | def __init__(self, keyphrase_file, thresh_hold=5): 10 | self.ads_wechat_flashtext = KeywordProcessor() 11 | 12 | ads_phrase_list = load_list_from_structedTxt(keyphrase_file) 13 | print(f"load {len(ads_phrase_list)} ads phrases:",ads_phrase_list) 14 | self.ads_wechat_flashtext.add_keywords_from_list(ads_phrase_list) 15 | self.split_flg = ['。','\n','!','?'] 16 | self.thresh_hold = thresh_hold 17 | 18 | def clean(self,text): 19 | text = text.strip() 20 | text = self.forward(text) 21 | #print("forward:",text) 22 | text = self.backward(text) 23 | #print("backward:",text) 24 | return text 25 | 26 | def forward(self,text): 27 | 28 | prev_density = 0.0 29 | prev_idx = 0 30 | 31 | no_hit_sentence_cnt = 0 32 | hit_pos = 0 33 | 34 | tlen = len(text) 35 | while prev_idx < tlen: 36 | curr_idx = prev_idx 37 | while curr_idx < tlen and text[curr_idx] not in self.split_flg: curr_idx += 1 38 | 39 | head_text = text[prev_idx:curr_idx+1] 40 | if len(head_text)
< 1: 41 | prev_idx = curr_idx + 1 42 | continue 43 | diff_cnt,keylen = self.calculate_density(head_text) 44 | #print(f"head_text:{head_text}, diff_cnt:{diff_cnt}, keylen:{keylen}") 45 | if diff_cnt < 1: 46 | prev_idx = curr_idx + 1 47 | no_hit_sentence_cnt += 1 48 | if no_hit_sentence_cnt >= self.thresh_hold: break 49 | else: 50 | hit_pos = curr_idx + 1 51 | prev_idx = curr_idx + 1 52 | no_hit_sentence_cnt = 0 53 | text = text[hit_pos:].strip() 54 | return text 55 | 56 | def backward(self,text): 57 | last_density = 0.0 58 | last_idx = len(text) - 1 59 | 60 | no_hit_sentence_cnt = 0 61 | hit_pos = last_idx 62 | 63 | while last_idx > 0: 64 | curr_idx = last_idx 65 | while curr_idx > 0 and text[curr_idx] not in self.split_flg: curr_idx -= 1 66 | 67 | tail_text = text[curr_idx+1:last_idx+1] 68 | if len(tail_text) < 1: 69 | last_idx = curr_idx - 1 70 | continue 71 | diff_cnt,keylen = self.calculate_density(tail_text) 72 | #print(f"tail_text:{tail_text}, diff_cnt:{diff_cnt}, keylen:{keylen}") 73 | if diff_cnt < 1: 74 | last_idx = curr_idx 75 | no_hit_sentence_cnt += 1 76 | if no_hit_sentence_cnt >= self.thresh_hold: break 77 | else: 78 | hit_pos = curr_idx 79 | last_idx = curr_idx 80 | no_hit_sentence_cnt = 0 81 | text = text[0:hit_pos+1].strip() 82 | return text 83 | 84 | def calculate_density(self,text): 85 | keyword_list = self.ads_wechat_flashtext.extract_keywords(text) 86 | #print("keyword_list:",keyword_list) 87 | keylen = sum([len(item) for item in keyword_list]) 88 | #ratio = 1.0*keylen / (len(text) + 0.005) 89 | diff_cnt = len(set(keyword_list)) 90 | return diff_cnt,keylen 91 | 92 | 93 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_best_of_n_sampler.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | from transformers import AutoTokenizer, GenerationConfig 5 | 6 | from trl import AutoModelForCausalLMWithValueHead 7 | from trl.core import LengthSampler 8 | from trl.extras import BestOfNSampler 9 | 10 | 11 | def queries_to_scores(list_of_strings): 12 | return [torch.rand(1).item() for _ in list_of_strings] 13 | 14 | 15 | class BestOfNSamplerTester(unittest.TestCase): 16 | """ 17 | Tests the BestOfNSampler class 18 | """ 19 | 20 | ref_model_name = "trl-internal-testing/dummy-GPT2-correct-vocab" 21 | output_length_sampler = LengthSampler(2, 6) 22 | model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name) 23 | tokenizer = AutoTokenizer.from_pretrained(ref_model_name) 24 | tokenizer.pad_token = tokenizer.eos_token 25 | output_length_sampler = LengthSampler(2, 6) 26 | 27 | def test_different_input_types(self): 28 | r""" 29 | Tests if the different input types normalizer works 30 | """ 31 | 32 | generation_config = GenerationConfig( 33 | min_length=-1, 34 | top_k=0.0, 35 | top_p=1.0, 36 | do_sample=True, 37 | pad_token_id=self.tokenizer.eos_token_id, 38 | ) 39 | 40 | output_length_sampler = LengthSampler(2, 6) 41 | 42 | best_of_n = BestOfNSampler( 43 | self.model, 44 | self.tokenizer, 45 | queries_to_scores, 46 | length_sampler=output_length_sampler, 47 | generation_config=generation_config, 48 | ) 49 | 50 | queries = ["hello world", "goodbye world"] 51 | tokenized_queries = [self.tokenizer.encode(query) for query in queries] 52 | 53 | various_queries_formats = [ 54 | (tokenized_queries[0], 1), 55 | (tokenized_queries, 2), 56 | (torch.tensor(tokenized_queries[1]), 1), 57 | ([torch.tensor(query) for query in 
tokenized_queries], 2), 58 | ] 59 | 60 | for q, expected_length in various_queries_formats: 61 | results = best_of_n.generate(q) 62 | self.assertIsInstance(results, list) 63 | assert len(results) == expected_length 64 | 65 | def test_different_sample_sizes_and_n_candidates_values(self): 66 | r""" 67 | Tests different sample sizes and n_candidates values 68 | """ 69 | generation_config = GenerationConfig( 70 | min_length=-1, 71 | top_k=0.0, 72 | top_p=1.0, 73 | do_sample=True, 74 | pad_token_id=self.tokenizer.eos_token_id, 75 | ) 76 | 77 | output_length_sampler = LengthSampler(6, 10) 78 | 79 | for sample_value, n_candidates_values, expected in [ 80 | (4, 2, 2), 81 | (10, 3, 3), 82 | (6, 4, 4), 83 | ]: 84 | best_of_n = BestOfNSampler( 85 | self.model, 86 | self.tokenizer, 87 | queries_to_scores, 88 | length_sampler=output_length_sampler, 89 | generation_config=generation_config, 90 | sample_size=sample_value, 91 | n_candidates=n_candidates_values, 92 | ) 93 | 94 | queries = ["hello world", "troll the world"] 95 | tokenized_queries = [self.tokenizer.encode(query) for query in queries] 96 | results = best_of_n.generate(tokenized_queries) 97 | for result in results: 98 | assert len(result) == expected 99 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-kindle-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import chardet 5 | from tqdm import tqdm 6 | from os import listdir, path 7 | 8 | 9 | 10 | def make_clean(args): 11 | global_file_no = 0 12 | global_id_no = 0 13 | 14 | subsets = sorted(listdir(args.source_path)) 15 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 16 | 17 | #subset_dir = subset_dir.replace(" ","\ ") 18 | file_dir = os.path.join(args.source_path,subset_dir) 19 | 20 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 21 | if os.path.exists(dest_file): os.remove(dest_file) 22 | global_file_no += 1 23 | of = open(dest_file,'w',encoding='utf-8') 24 | 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | 29 | #root = root.replace(" ","\ ") 30 | for file in files: 31 | #file = file.replace(" ","\ ") 32 | if not (file.endswith(".txt") or file.endswith(".shtml")): continue 33 | input_file = os.path.join(root,file) 34 | 35 | html_str = open(input_file, 'rb').read() 36 | encoding_info = chardet.detect(html_str) 37 | original_encoding = encoding_info['encoding'] 38 | if original_encoding not in ["UTF-8","GB2312","GB18030","Big5","utf-8","UTF-16","UTF-32"]: continue 39 | 40 | html_str = html_str.decode(original_encoding, 'ignore')#.encode('utf-8') 41 | if len(html_str) < 512: continue 42 | 43 | js_dict = {} 44 | js_dict["id"] = global_id_no 45 | js_dict["source"] = "cn-kindle" 46 | js_dict["subset"] = subset_dir 47 | js_dict["source_id"] = input_file 48 | global_id_no += 1 49 | 50 | js_dict["content"] = html_str 51 | 52 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 53 | if of.tell() > args.max_size: 54 | of.close() 55 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 56 | if os.path.exists(dest_file): os.remove(dest_file) 57 | of = open(dest_file,'w',encoding='utf-8') 58 | global_file_no += 1 59 | of.close() 60 | 61 | 62 | def parse_args(): 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--source_path', 65 | 
type=str, 66 | default="/data/data_warehouse/llm/source_data/cn-kindle", 67 | help='Directory containing trained actor model') 68 | parser.add_argument('--dest_path', 69 | type=str, 70 | default="/data/data_warehouse/llm/source_data/cn-kindle2", 71 | help='Directory containing trained actor model') 72 | parser.add_argument('--dataset_name', 73 | type=str, 74 | default="cn-kindle", 75 | help="") 76 | parser.add_argument('--max_size', 77 | type=int, 78 | default=500 * 1024 * 1024, 79 | help="max chunk size") 80 | args = parser.parse_args() 81 | return args 82 | 83 | if __name__ == "__main__": 84 | args = parse_args() 85 | 86 | if not os.path.exists(args.dest_path): 87 | os.makedirs(args.dest_path, exist_ok=True) 88 | make_clean(args) 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/models/auxiliary_modules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | 16 | import torch 17 | import torch.nn as nn 18 | import torchvision 19 | from huggingface_hub import hf_hub_download 20 | from huggingface_hub.utils import EntryNotFoundError 21 | from transformers import CLIPModel 22 | 23 | from trl.import_utils import is_npu_available, is_xpu_available 24 | 25 | 26 | class MLP(nn.Module): 27 | def __init__(self): 28 | super().__init__() 29 | self.layers = nn.Sequential( 30 | nn.Linear(768, 1024), 31 | nn.Dropout(0.2), 32 | nn.Linear(1024, 128), 33 | nn.Dropout(0.2), 34 | nn.Linear(128, 64), 35 | nn.Dropout(0.1), 36 | nn.Linear(64, 16), 37 | nn.Linear(16, 1), 38 | ) 39 | 40 | def forward(self, embed): 41 | return self.layers(embed) 42 | 43 | 44 | class AestheticScorer(torch.nn.Module): 45 | """ 46 | This model attempts to predict the aesthetic score of an image. The aesthetic score 47 | is a numerical approximation of how much a specific image is liked by humans on average. 
48 | This is from https://github.com/christophschuhmann/improved-aesthetic-predictor 49 | """ 50 | 51 | def __init__(self, *, dtype, model_id, model_filename): 52 | super().__init__() 53 | self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 54 | self.normalize = torchvision.transforms.Normalize( 55 | mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711] 56 | ) 57 | self.target_size = 224 58 | self.mlp = MLP() 59 | try: 60 | cached_path = hf_hub_download(model_id, model_filename) 61 | except EntryNotFoundError: 62 | cached_path = os.path.join(model_id, model_filename) 63 | state_dict = torch.load(cached_path, map_location=torch.device("cpu")) 64 | self.mlp.load_state_dict(state_dict) 65 | self.dtype = dtype 66 | self.eval() 67 | 68 | def __call__(self, images): 69 | device = next(self.parameters()).device 70 | images = torchvision.transforms.Resize(self.target_size)(images) 71 | images = self.normalize(images).to(self.dtype).to(device) 72 | embed = self.clip.get_image_features(pixel_values=images) 73 | # normalize embedding 74 | embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) 75 | reward = self.mlp(embed).squeeze(1) 76 | return reward 77 | 78 | 79 | def aesthetic_scorer(hub_model_id, model_filename): 80 | scorer = AestheticScorer( 81 | model_id=hub_model_id, 82 | model_filename=model_filename, 83 | dtype=torch.float32, 84 | ) 85 | if is_npu_available(): 86 | scorer = scorer.npu() 87 | elif is_xpu_available(): 88 | scorer = scorer.xpu() 89 | else: 90 | scorer = scorer.cuda() 91 | 92 | def _fn(images, prompts, metadata): 93 | images = (images).clamp(0, 1) 94 | scores = scorer(images) 95 | return scores, {} 96 | 97 | return _fn 98 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-39health-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | import sys 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | from utils.general_policy import GClean 11 | 12 | _TEXT_LONG_REQUIRED_ = 10 13 | cleaner = GClean(_TEXT_LONG_REQUIRED_) 14 | 15 | def make_clean(args): 16 | global_file_no = 0 17 | global_id_no = 0 18 | 19 | jsonlfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(jsonlfiles),total=len(jsonlfiles)): 21 | 22 | dest_file = os.path.join(args.dest_path,"part-39-{:06d}.jsonl".format(global_file_no)) 23 | if os.path.exists(dest_file): os.remove(dest_file) 24 | global_file_no += 1 25 | of = open(dest_file,'w',encoding='utf-8') 26 | 27 | input_file = os.path.join(args.source_path,subfile) 28 | print("input_file:",input_file) 29 | with open(input_file, 'r',encoding='utf-8') as fin: 30 | for line in tqdm(fin): 31 | js_ = json.loads(line) 32 | ''' 33 | {"question": "唐氏筛查afp值结果是0.81----(女24岁)", "answer": "你好,唐氏筛查如果mom值偏高的话,有可能胎儿不正常。建议您进一步做无创DNA的检查。这个是相对比较准确的。唐氏筛查跟很多因素有关系,比如您填写的数值身高体,体重,末次月经。大部分怀孕的胎儿是正常的。怀孕期间每一次的检查都是排除胎儿畸形的。"} 34 | ''' 35 | js_dict = {} 36 | js_dict["id"] = global_id_no 37 | js_dict["source"] = "cn-medical-treatment" 38 | js_dict["subset"] = "39-health" 39 | js_dict["source_id"] = "" 40 | global_id_no += 1 41 | 42 | ques = js_["question"].strip() 43 | if ques[-1] not in ['。','!','?',"?",",",","]: 44 | ques = ques + "?" 45 | else: 46 | ques = ques[0:-1] + "?" 
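# Clean the answer half of the QA pair next: strip whitespace, drop any leading punctuation via GClean.clean_punct_at_begin, then join the normalized question and answer into a single "content" field.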
47 | answ = js_["answer"].strip() 48 | answ = cleaned_content = cleaner.clean_punct_at_begin(answ) 49 | js_dict["content"] = ques + answ 50 | 51 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 52 | if of.tell() > args.max_size: 53 | of.close() 54 | dest_file = os.path.join(args.dest_path,"part-39-{:06d}.jsonl".format(global_file_no)) 55 | if os.path.exists(dest_file): os.remove(dest_file) 56 | of = open(dest_file,'w',encoding='utf-8') 57 | global_file_no += 1 58 | of.close() 59 | 60 | def parse_args(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--source_path', 63 | type=str, 64 | default="/data/data_warehouse/SourceData/39_health", 65 | help='Directory containing trained actor model') 66 | parser.add_argument('--dest_path', 67 | type=str, 68 | default="/localdisk/llm/source_data/cn-39-health", 69 | help='Directory containing trained actor model') 70 | parser.add_argument('--dataset_name', 71 | type=str, 72 | default="cn-cn-39-health", 73 | help="") 74 | parser.add_argument('--max_size', 75 | type=int, 76 | default=200 * 1024 * 1024, 77 | help="max chunk size") 78 | args = parser.parse_args() 79 | return args 80 | 81 | if __name__ == "__main__": 82 | args = parse_args() 83 | 84 | if not os.path.exists(args.dest_path): 85 | os.makedirs(args.dest_path, exist_ok=True) 86 | make_clean(args) 87 | 88 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/orpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class ORPOConfig(TrainingArguments): 22 | r""" 23 | ORPOConfig collects all training arguments related to the [`ORPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_completion_length (`int`, defaults to `None`): 35 | The maximum length of the completions. This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in ORPO loss (lambda/alpha in paper/code) that is the weight of the relative loss ratio in the SFT loss. 38 | label_pad_token_id (`int`, defaults to `-100`): 39 | The label pad token id. 
This argument is required if you want to use the default data collator. 40 | padding_value (`int`, defaults to `None`): 41 | The padding value if it is different to the tokenizer's pad_token_id. 42 | truncation_mode (`str`, defaults to `keep_end`): 43 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 44 | generate_during_eval (`bool`, defaults to `False`): 45 | Whether to sample and log generations during evaluation step. 46 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 47 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 48 | disable_dropout (`bool`, defaults to `True`): 49 | Whether or not to disable dropouts in `model`. 50 | model_init_kwargs (`Optional[Dict]`, *optional*): 51 | Dict of Optional kwargs to pass when instantiating the model from a string 52 | dataset_num_proc (`Optional[int]`, *optional*): 53 | The number of workers to use to tokenize the data. Defaults to None. 54 | """ 55 | 56 | max_length: Optional[int] = None 57 | max_prompt_length: Optional[int] = None 58 | max_completion_length: Optional[int] = None 59 | 60 | beta: float = 0.1 61 | disable_dropout: bool = True 62 | 63 | label_pad_token_id: int = -100 64 | padding_value: int = None 65 | truncation_mode: str = "keep_end" 66 | generate_during_eval: bool = False 67 | is_encoder_decoder: Optional[bool] = None 68 | 69 | model_init_kwargs: Optional[Dict] = None 70 | 71 | dataset_num_proc: Optional[int] = None 72 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_ddpo_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 metric-space, The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import gc 15 | import unittest 16 | 17 | import torch 18 | 19 | from trl import is_diffusers_available 20 | 21 | from .testing_utils import require_diffusers 22 | 23 | 24 | if is_diffusers_available(): 25 | from trl import DDPOConfig, DDPOTrainer, DefaultDDPOStableDiffusionPipeline 26 | 27 | 28 | def scorer_function(images, prompts, metadata): 29 | return torch.randn(1) * 3.0, {} 30 | 31 | 32 | def prompt_function(): 33 | return ("cabbages", {}) 34 | 35 | 36 | @require_diffusers 37 | class DDPOTrainerTester(unittest.TestCase): 38 | """ 39 | Test the DDPOTrainer class. 
40 | """ 41 | 42 | def setUp(self): 43 | self.ddpo_config = DDPOConfig( 44 | num_epochs=2, 45 | train_gradient_accumulation_steps=1, 46 | per_prompt_stat_tracking_buffer_size=32, 47 | sample_num_batches_per_epoch=2, 48 | sample_batch_size=2, 49 | mixed_precision=None, 50 | save_freq=1000000, 51 | ) 52 | pretrained_model = "hf-internal-testing/tiny-stable-diffusion-torch" 53 | pretrained_revision = "main" 54 | 55 | pipeline = DefaultDDPOStableDiffusionPipeline( 56 | pretrained_model, pretrained_model_revision=pretrained_revision, use_lora=False 57 | ) 58 | 59 | self.trainer = DDPOTrainer(self.ddpo_config, scorer_function, prompt_function, pipeline) 60 | 61 | return super().setUp() 62 | 63 | def tearDown(self) -> None: 64 | gc.collect() 65 | 66 | def test_loss(self): 67 | advantage = torch.tensor([-1.0]) 68 | clip_range = 0.0001 69 | ratio = torch.tensor([1.0]) 70 | loss = self.trainer.loss(advantage, clip_range, ratio) 71 | self.assertEqual(loss.item(), 1.0) 72 | 73 | def test_generate_samples(self): 74 | samples, output_pairs = self.trainer._generate_samples(1, 2) 75 | self.assertEqual(len(samples), 1) 76 | self.assertEqual(len(output_pairs), 1) 77 | self.assertEqual(len(output_pairs[0][0]), 2) 78 | 79 | def test_calculate_loss(self): 80 | samples, _ = self.trainer._generate_samples(1, 2) 81 | sample = samples[0] 82 | 83 | latents = sample["latents"][0, 0].unsqueeze(0) 84 | next_latents = sample["next_latents"][0, 0].unsqueeze(0) 85 | log_probs = sample["log_probs"][0, 0].unsqueeze(0) 86 | timesteps = sample["timesteps"][0, 0].unsqueeze(0) 87 | prompt_embeds = sample["prompt_embeds"] 88 | advantage = torch.tensor([1.0], device=prompt_embeds.device) 89 | 90 | self.assertEqual(latents.shape, (1, 4, 64, 64)) 91 | self.assertEqual(next_latents.shape, (1, 4, 64, 64)) 92 | self.assertEqual(log_probs.shape, (1,)) 93 | self.assertEqual(timesteps.shape, (1,)) 94 | self.assertEqual(prompt_embeds.shape, (2, 77, 32)) 95 | loss, approx_kl, clipfrac = self.trainer.calculate_loss( 96 | latents, timesteps, next_latents, log_probs, advantage, prompt_embeds 97 | ) 98 | 99 | self.assertTrue(torch.isfinite(loss.cpu())) 100 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-sina_iask-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | import sys 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | from utils.general_policy import GClean 11 | 12 | _TEXT_LONG_REQUIRED_ = 10 13 | cleaner = GClean(_TEXT_LONG_REQUIRED_) 14 | 15 | def make_clean(args): 16 | global_file_no = 0 17 | global_id_no = 0 18 | 19 | jsonlfiles = sorted(listdir(args.source_path)) 20 | for dir_no,subfile in tqdm(enumerate(jsonlfiles),total=len(jsonlfiles)): 21 | 22 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 23 | if os.path.exists(dest_file): os.remove(dest_file) 24 | global_file_no += 1 25 | of = open(dest_file,'w',encoding='utf-8') 26 | 27 | input_file = os.path.join(args.source_path,subfile) 28 | print("input_file:",input_file) 29 | with open(input_file, 'r',encoding='utf-8') as fin: 30 | for line in tqdm(fin): 31 | js_ = json.loads(line) 32 | ''' 33 | {"question": "康宝xdr53-tvc1消毒柜使用方法", "answers": 
"、使用前认真检查设备运转是否正常,调节器和显示器是否“失控”。2、把洗净、抹净余水的餐具、茶具、食具按平行排列方式倒放或斜放于柜内架层上。3、关好柜门接通电源,扭动起动键。4、扭动“起动”键后,石英管开始发亮,表示消毒工作开始,消 毒结束后,自动切断电源,15分钟后才能打开门取用餐具。", "category": "生活"} 34 | {"question": "临期的香水可以买吗", "answers": "最好不要买吧,因为香水这种东西还挺耐用的,不可能快速就用完,有可能过期了也只用了一点点,小毫升的可以买,因为很快消耗掉,所以没关系,特别大的一瓶就没必要买了,买香水最好提前试香,选最喜欢的买,避开不喜欢的味道,没必要追求便宜去买的", "category": "生活"} 35 | ''' 36 | js_dict = {} 37 | js_dict["id"] = global_id_no 38 | js_dict["source"] = "cn-sina-iask" 39 | js_dict["subset"] = js_["category"].strip() 40 | js_dict["source_id"] = "" 41 | global_id_no += 1 42 | 43 | ques = js_["question"].strip() 44 | if ques[-1] not in ['。','!','?',"?",",",","]: 45 | ques = ques + "?" 46 | else: 47 | ques = ques[0:-1] + "?" 48 | answ = js_["answers"].strip() 49 | answ = cleaned_content = cleaner.clean_punct_at_begin(answ) 50 | js_dict["content"] = ques + answ 51 | 52 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 53 | if of.tell() > args.max_size: 54 | of.close() 55 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 56 | if os.path.exists(dest_file): os.remove(dest_file) 57 | of = open(dest_file,'w',encoding='utf-8') 58 | global_file_no += 1 59 | of.close() 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--source_path', 64 | type=str, 65 | default="/data/data_warehouse/SourceData/sina_iask", 66 | help='Directory containing trained actor model') 67 | parser.add_argument('--dest_path', 68 | type=str, 69 | default="/localdisk/llm/source_data/cn-sina-iask", 70 | help='Directory containing trained actor model') 71 | parser.add_argument('--dataset_name', 72 | type=str, 73 | default="cn-sina-iask", 74 | help="") 75 | parser.add_argument('--max_size', 76 | type=int, 77 | default=200 * 1024 * 1024, 78 | help="max chunk size") 79 | args = parser.parse_args() 80 | return args 81 | 82 | if __name__ == "__main__": 83 | args = parse_args() 84 | 85 | if not os.path.exists(args.dest_path): 86 | os.makedirs(args.dest_path, exist_ok=True) 87 | make_clean(args) 88 | 89 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/mnbvc_prepare-checkpoint.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import numpy as np 4 | import json 5 | import random 6 | import os 7 | import hashlib 8 | 9 | 10 | ''' 11 | {'fldStatus': 2, 'fldColumnID': 4, 'fldSubject': '蓝点Linux半年上市', 'fldContent': '','fldCreateTime': '2000-04-20 18:03:59', 'fldColumnName': '中国.com', 'fldUserID': 'liuren', 'fldName': '刘韧', 'fldView': 472, 'fldTypeID': '原创-IT', 'fldArticleID': 4, 'fldUserNum': 2} 12 | ''' 13 | def json2jsonl(): 14 | with open("./donews.18402.json","r",encoding='utf-8') as fo: data = json.load(fo) 15 | 16 | sft_data = [] 17 | idx = 0 18 | for idx,item in enumerate(data): 19 | js_dict = {} 20 | js_dict["id"] = idx + 1 21 | js_dict["source"] = "donews" 22 | js_dict["subset"] = item["fldSubject"] 23 | js_dict["source_id"] = "" 24 | js_dict["fldCreateTime"] = item["fldCreateTime"] 25 | js_dict["fldTypeID"] = item["fldTypeID"] 26 | js_dict["content"] = item["fldContent"] 27 | sft_data.append(js_dict) 28 | 29 | dest_file = os.path.join("./","donews.18402.jsonl") 30 | if os.path.exists(dest_file): os.remove(dest_file) 31 | of = open(dest_file,'w',encoding='utf-8') 32 | 33 | #random.shuffle(sft_data) 34 | for item in sft_data: 35 | print(json.dumps(item,ensure_ascii=False),file=of) 36 | of.close() 37 | print(f"writting {len(sft_data)} 
lines into {dest_file}") 38 | 39 | def data2mnbvc_style(input_file,output_dir): 40 | ''' 41 | { 42 | '文件名': '文件.txt', 43 | '是否待查文件': False, 44 | '是否重复文件': False, 45 | '文件大小': 1024, 46 | 'simhash': 0, 47 | '最长段落长度': 0, 48 | '段落数': 0, 49 | '去重段落数': 0, 50 | '低质量段落数': 0, 51 | '段落': [ 52 | { 53 | '行号': 1, 54 | '是否重复': False, 55 | '是否跨文件重复': False, 56 | 'md5': 'md5hash1', 57 | '内容': '这是第一段文字。' 58 | } 59 | ] 60 | } 61 | ''' 62 | global_file_no = 0 63 | dest_file = os.path.join(output_dir,"mnbvc-donews-part-{:06d}.jsonl".format(global_file_no)) 64 | if os.path.exists(dest_file): os.remove(dest_file) 65 | of = open(dest_file,'w',encoding='utf-8') 66 | 67 | for line in open(input_file,"r",encoding='utf-8'): 68 | line = line.strip() 69 | if len(line) < 5: continue 70 | js_dict = json.loads(line) 71 | 72 | js_new = {} 73 | js_new['文件名'] = js_dict["source"] 74 | js_new['是否待查文件'] = False 75 | js_new['是否重复文件'] = False 76 | js_new['文件大小'] = len(js_dict["content"]) 77 | js_new['simhash'] = '' 78 | js_new['最长段落长度'] = len(js_dict["content"]) 79 | js_new['段落数'] = 1 80 | js_new['去重段落数'] = 0 81 | js_new['低质量段落数'] = 0 82 | js_new['段落'] = [] 83 | 84 | item = {} 85 | item['行号'] = 1 86 | item['是否重复'] = False 87 | item['是否跨文件重复'] = False 88 | 89 | content = js_dict["content"] 90 | md5 = hashlib.md5(content.encode('utf-8')).hexdigest() 91 | item['md5'] = md5 92 | item['内容'] = js_dict["content"] 93 | js_new['段落'].append(item) 94 | 95 | print(json.dumps(js_new,ensure_ascii=False),file=of) 96 | if of.tell() > 20 * 1024 * 1024: 97 | of.close() 98 | dest_file = os.path.join(output_dir,"mnbvc-donews-part-{:06d}.jsonl".format(global_file_no)) 99 | if os.path.exists(dest_file): os.remove(dest_file) 100 | of = open(dest_file,'w',encoding='utf-8') 101 | global_file_no += 1 102 | 103 | of.close() 104 | 105 | 106 | if __name__ == "__main__": 107 | input_file = "../llm/clean_data/cn-donews/v1/good/donews.18402.jsonl" 108 | output_file = "../llm/clean_data/cn-donews/v1/good/" 109 | data2mnbvc_style(input_file,output_file) 110 | 111 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/sft_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class SFTConfig(TrainingArguments): 22 | r""" 23 | Initialize SFTConfig. 24 | 25 | Args: 26 | dataset_text_field (`Optional[str]`): 27 | The name of the text field of the dataset, in case this is passed by a user, the trainer will automatically create a 28 | `ConstantLengthDataset` based on the `dataset_text_field` argument. Defaults to None. 29 | packing (`Optional[bool]`): 30 | Used only in case `dataset_text_field` is passed. 
This argument is used by the `ConstantLengthDataset` to pack the sequences 31 | of the dataset. Defaults to False. 32 | max_seq_length (`Optional[int]`): 33 | The maximum sequence length to use for the `ConstantLengthDataset` and for automatically creating the Dataset. Defaults to min of the smaller of the `tokenizer.model_max_length` and `1024`. 34 | dataset_num_proc (`Optional[int]`): 35 | The number of workers to use to tokenize the data. Only used when `packing=False`. Defaults to None. 36 | dataset_batch_size (`int`): 37 | The number of examples to tokenize per batch. If batch_size <= 0 or batch_size == None, 38 | tokenize the full dataset as a single batch. Defaults to 1000. 39 | neftune_noise_alpha (`Optional[float]`): 40 | If not `None`, this will activate NEFTune noise embeddings. This has been proven to drastically improve model performances for instruction 41 | fine-tuning. Check out the original paper here: https://huggingface.co/papers/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune 42 | model_init_kwargs: (`Optional[Dict]`, *optional*): 43 | Dict of Optional kwargs to pass when instantiating the model from a string. 44 | dataset_kwargs: (`Optional[Dict]`, *optional*): 45 | Dict of Optional kwargs to pass when creating packed or non-packed datasets 46 | eval_packing: (`Optional[bool]`, *optional*): 47 | Whether to pack the eval dataset as well. Defaults to `packing` if `None` is passed. 48 | num_of_sequences (`Optional[int]`): 49 | The number of sequences to use for the `ConstantLengthDataset`. Defaults to `1024`. 50 | chars_per_token (`Optional[float]`): 51 | The number of characters per token to use for the `ConstantLengthDataset`. Defaults to `3.6`. You can check how this is computed in the 52 | stack-llama example: 53 | [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53). 
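        Example (a minimal illustrative sketch, assuming `SFTConfig` is importable from the top-level `trl` package as in upstream trl; the output_dir and field values are placeholders):

            from trl import SFTConfig
            config = SFTConfig(output_dir="./sft-out", dataset_text_field="text", packing=True, max_seq_length=1024)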
54 | """ 55 | 56 | dataset_text_field: Optional[str] = None 57 | packing: Optional[bool] = False 58 | max_seq_length: Optional[int] = None 59 | dataset_num_proc: Optional[int] = None 60 | dataset_batch_size: int = 1000 61 | neftune_noise_alpha: Optional[float] = None 62 | model_init_kwargs: Optional[Dict] = None 63 | dataset_kwargs: Optional[Dict] = None 64 | eval_packing: Optional[bool] = None 65 | num_of_sequences: Optional[int] = 1024 66 | chars_per_token: Optional[float] = 3.6 67 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/extras/dataset_formatting.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable, Literal, Optional, Union 3 | 4 | from datasets import Dataset, Value 5 | from transformers import AutoTokenizer 6 | 7 | from ..trainer.utils import ConstantLengthDataset 8 | 9 | 10 | FORMAT_MAPPING = { 11 | "chatml": [{"content": Value(dtype="string", id=None), "role": Value(dtype="string", id=None)}], 12 | "instruction": {"completion": Value(dtype="string", id=None), "prompt": Value(dtype="string", id=None)}, 13 | } 14 | 15 | 16 | def conversations_formatting_function(tokenizer: AutoTokenizer, messages_field: Literal["messages", "conversations"]): 17 | r""" 18 | return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the tokenizer 19 | apply chat template to the dataset 20 | """ 21 | 22 | def format_dataset(examples): 23 | if isinstance(examples[messages_field][0], list): 24 | output_texts = [] 25 | for i in range(len(examples[messages_field])): 26 | output_texts.append(tokenizer.apply_chat_template(examples[messages_field][i], tokenize=False)) 27 | return output_texts 28 | else: 29 | return tokenizer.apply_chat_template(examples[messages_field], tokenize=False) 30 | 31 | return format_dataset 32 | 33 | 34 | def instructions_formatting_function(tokenizer: AutoTokenizer): 35 | r""" 36 | return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the tokenizer 37 | apply chat template to the dataset 38 | """ 39 | 40 | def format_dataset(examples): 41 | if isinstance(examples["prompt"], list): 42 | output_texts = [] 43 | for i in range(len(examples["prompt"])): 44 | converted_sample = [ 45 | {"role": "user", "content": examples["prompt"][i]}, 46 | {"role": "assistant", "content": examples["completion"][i]}, 47 | ] 48 | output_texts.append(tokenizer.apply_chat_template(converted_sample, tokenize=False)) 49 | return output_texts 50 | else: 51 | converted_sample = [ 52 | {"role": "user", "content": examples["prompt"]}, 53 | {"role": "assistant", "content": examples["completion"]}, 54 | ] 55 | return tokenizer.apply_chat_template(converted_sample, tokenize=False) 56 | 57 | return format_dataset 58 | 59 | 60 | def get_formatting_func_from_dataset( 61 | dataset: Union[Dataset, ConstantLengthDataset], tokenizer: AutoTokenizer 62 | ) -> Optional[Callable]: 63 | r""" 64 | Finds the correct formatting function based on the dataset structure. 
Currently supported datasets are: 65 | - `ChatML` with [{"role": str, "content": str}] 66 | - `instruction` with [{"prompt": str, "completion": str}] 67 | 68 | Args: 69 | dataset (Dataset): User dataset 70 | tokenizer (AutoTokenizer): Tokenizer used for formatting 71 | 72 | Returns: 73 | Callable: Formatting function if the dataset format is supported else None 74 | """ 75 | if isinstance(dataset, Dataset): 76 | if "messages" in dataset.features: 77 | if dataset.features["messages"] == FORMAT_MAPPING["chatml"]: 78 | logging.info("Formatting dataset with chatml format") 79 | return conversations_formatting_function(tokenizer, "messages") 80 | if "conversations" in dataset.features: 81 | if dataset.features["conversations"] == FORMAT_MAPPING["chatml"]: 82 | logging.info("Formatting dataset with chatml format") 83 | return conversations_formatting_function(tokenizer, "conversations") 84 | elif dataset.features == FORMAT_MAPPING["instruction"]: 85 | logging.info("Formatting dataset with instruction format") 86 | return instructions_formatting_function(tokenizer) 87 | 88 | return None 89 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/model_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional 3 | 4 | from ..core import flatten_dict 5 | 6 | 7 | @dataclass 8 | class ModelConfig: 9 | """ 10 | Arguments which define the model and tokenizer to load. 11 | """ 12 | 13 | model_name_or_path: Optional[str] = field( 14 | default=None, 15 | metadata={"help": ("The model checkpoint for weights initialization.")}, 16 | ) 17 | model_revision: str = field( 18 | default="main", 19 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 20 | ) 21 | torch_dtype: Optional[str] = field( 22 | default=None, 23 | metadata={ 24 | "help": ( 25 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 26 | "dtype will be automatically derived from the model's weights." 
27 | ), 28 | "choices": ["auto", "bfloat16", "float16", "float32"], 29 | }, 30 | ) 31 | trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) 32 | attn_implementation: Optional[str] = field( 33 | default=None, 34 | metadata={ 35 | "help": ( 36 | "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" 37 | ) 38 | }, 39 | ) 40 | use_peft: bool = field( 41 | default=False, 42 | metadata={"help": ("Whether to use PEFT or not for training.")}, 43 | ) 44 | lora_r: Optional[int] = field( 45 | default=16, 46 | metadata={"help": ("LoRA R value.")}, 47 | ) 48 | lora_alpha: Optional[int] = field( 49 | default=32, 50 | metadata={"help": ("LoRA alpha.")}, 51 | ) 52 | lora_dropout: Optional[float] = field( 53 | default=0.05, 54 | metadata={"help": ("LoRA dropout.")}, 55 | ) 56 | lora_target_modules: Optional[List[str]] = field( 57 | default=None, 58 | metadata={"help": ("LoRA target modules.")}, 59 | ) 60 | lora_modules_to_save: Optional[List[str]] = field( 61 | default=None, 62 | metadata={"help": ("Model layers to unfreeze & train")}, 63 | ) 64 | lora_task_type: str = field( 65 | default="CAUSAL_LM", metadata={"help": "The task_type to pass for LoRA (use SEQ_CLS for reward modeling)"} 66 | ) 67 | use_rslora: bool = field( 68 | default=False, 69 | metadata={ 70 | "help": ( 71 | "Use Rank-Stabilized LoRA (https://huggingface.co/papers/2312.03732), which sets the adapter " 72 | "scaling factor to lora_alpha/√r, instead of the original default value of `lora_alpha/r`." 73 | ) 74 | }, 75 | ) 76 | load_in_8bit: bool = field( 77 | default=False, metadata={"help": "use 8 bit precision for the base model - works only with LoRA"} 78 | ) 79 | load_in_4bit: bool = field( 80 | default=False, metadata={"help": "use 4 bit precision for the base model - works only with LoRA"} 81 | ) 82 | 83 | bnb_4bit_quant_type: Optional[str] = field( 84 | default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"} 85 | ) 86 | use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) 87 | 88 | def to_dict(self): 89 | output_dict = {} 90 | for key, value in self.__dict__.items(): 91 | output_dict[key] = value 92 | return flatten_dict(output_dict) 93 | 94 | def __post_init__(self): 95 | if self.load_in_8bit and self.load_in_4bit: 96 | raise ValueError("You can't use 8 bit and 4 bit precision at the same time") 97 | 98 | if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1: 99 | self.lora_target_modules = self.lora_target_modules[0] 100 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/examples/research_projects/tools/calculator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import re 17 | 18 | import numpy as np 19 | import torch 20 | from transformers import AutoTokenizer, load_tool 21 | 22 | from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, TextEnvironment 23 | 24 | 25 | def generate_data(n): 26 | """Generate random arithmetic tasks and answers.""" 27 | tasks, answers = [], [] 28 | for _ in range(n): 29 | a = np.random.randint(0, 50) 30 | b = np.random.randint(0, 50) 31 | op = np.random.choice(["-", "+", "*"]) 32 | tasks.append(f"\n\nWhat is {a} {op} {b}?") 33 | if op == "-": 34 | answers.append(a - b) 35 | elif op == "+": 36 | answers.append(a + b) 37 | else: 38 | answers.append(a * b) 39 | return tasks, answers 40 | 41 | 42 | def exact_match_reward(responses, answers=None): 43 | """Reward if generated response contains correct answer.""" 44 | rewards = [] 45 | pattern = r"Result\s*=\s*(-?\d+(?:\.\d+)?)\s*" # generated by chatGPT 46 | for response, answer in zip(responses, answers): 47 | reward = 0.0 48 | predicted_number = None 49 | match_pattern = re.findall(pattern, response) 50 | if match_pattern: 51 | predicted_number = float(match_pattern[0]) 52 | if predicted_number is not None: 53 | if np.abs(predicted_number - answer) < 0.01: 54 | reward += 1.0 55 | rewards.append(torch.tensor(reward)) 56 | return rewards 57 | 58 | 59 | # set up models 60 | model_id = "gpt2" 61 | model = AutoModelForCausalLMWithValueHead.from_pretrained(model_id) 62 | model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_id) 63 | tokenizer = AutoTokenizer.from_pretrained(model_id) 64 | tokenizer.pad_token = tokenizer.eos_token 65 | 66 | # system prompt 67 | prompt = """\ 68 | What is 13-3? 69 | 70 | 13-310.0 71 | 72 | Result=10 73 | 74 | What is 4*3? 
75 | 76 | 4*312.0 77 | 78 | Result=12""" 79 | 80 | generation_kwargs = { 81 | "min_length": -1, 82 | "top_k": 0.0, 83 | "top_p": 1.0, 84 | "do_sample": True, 85 | "pad_token_id": tokenizer.eos_token_id, 86 | "eos_token_id": -1, 87 | "max_new_tokens": 32, 88 | } 89 | 90 | # trainer 91 | ppo_config = PPOConfig( 92 | batch_size=256, 93 | learning_rate=1.41e-5, 94 | mini_batch_size=64, 95 | log_with="wandb", 96 | ) 97 | ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer) 98 | 99 | # text env 100 | text_env = TextEnvironment( 101 | model, 102 | tokenizer, 103 | {"SimpleCalculatorTool": load_tool("ybelkada/simple-calculator")}, 104 | exact_match_reward, 105 | prompt, 106 | generation_kwargs=generation_kwargs, 107 | ) 108 | 109 | # main training loop 110 | for step in range(100): 111 | tasks, answers = generate_data(ppo_config.batch_size) 112 | queries, responses, masks, rewards, histories = text_env.run(tasks, answers=answers) 113 | train_stats = ppo_trainer.step(queries, responses, rewards, masks) 114 | 115 | response_texts = [tokenizer.decode(response) for response in responses] 116 | query_texts = [tokenizer.decode(query) for query in queries] 117 | texts = {"query": [qt.split("")[-1].strip() for qt in query_texts], "response": response_texts} 118 | ppo_trainer.log_stats(train_stats, texts, rewards, columns_to_log=["query", "response", "answer"]) 119 | ppo_trainer.save_pretrained(model_id + "-calculator") 120 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/.ipynb_checkpoints/preprocess_cn-mnbvc-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gzip 4 | import argparse 5 | import chardet 6 | from tqdm import tqdm 7 | from os import listdir, path 8 | 9 | def make_clean(args): 10 | global_file_no = 0 11 | global_id_no = 0 12 | 13 | subsets = sorted(listdir(args.source_path)) 14 | for dir_no,subset_dir in tqdm(enumerate(subsets),total=len(subsets)): 15 | 16 | if subset_dir not in ["gov","law","news","qa"]: continue 17 | 18 | file_dir = os.path.join(args.source_path,subset_dir) 19 | 20 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 21 | if os.path.exists(dest_file): os.remove(dest_file) 22 | global_file_no += 1 23 | of = open(dest_file,'w',encoding='utf-8') 24 | 25 | for root, dirs, files in os.walk(file_dir): 26 | print('root_dir:', root) 27 | print('files:', files) 28 | for file in files: 29 | if not file.endswith(".jsonl.gz"):continue 30 | input_file = os.path.join(root,file) 31 | print("input_file:",input_file) 32 | with gzip.open(input_file, 'rt') as f: 33 | for line in f: 34 | js_ = json.loads(line) 35 | 36 | js_dict = {} 37 | js_dict["id"] = global_id_no 38 | js_dict["source"] = "cn-mnbvc" 39 | js_dict["subset"] = subset_dir 40 | js_dict["source_id"] = file 41 | global_id_no += 1 42 | 43 | if subset_dir in ["gov"]: 44 | if "文件名" in js_: 45 | js_dict["source_id"] = js_["文件名"] 46 | js_dict["content"] = '\n'.join([item["内容"] for item in js_["段落"]]) 47 | else: 48 | js_dict["source_id"] = eval(js_["meta"])["文件名"] 49 | js_dict["content"] = js_["text"] 50 | elif subset_dir in ["law"]: 51 | js_dict["source_id"] = js_["分卷名"] 52 | js_dict["content"] = js_["详情"] 53 | elif subset_dir in ["news"]: 54 | js_dict["source_id"] = os.path.basename(js_["文件名"]) 55 | js_dict["content"] = '\n'.join([item["内容"] for item in js_["段落"]]) 56 | elif subset_dir in ["qa"]: 57 | js_dict["source_id"] = js_["来源"] 
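# QA subset: the source id is taken from the 来源 (source) field; the next line joins 问 (question) and 答 (answer) with a newline to form the content.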
58 | js_dict["content"] = js_["问"]+"\n"+js_["答"] 59 | 60 | print(json.dumps(js_dict,ensure_ascii=False),file=of) 61 | if of.tell() > args.max_size: 62 | of.close() 63 | dest_file = os.path.join(args.dest_path,"part-{:06d}.jsonl".format(global_file_no)) 64 | if os.path.exists(dest_file): os.remove(dest_file) 65 | of = open(dest_file,'w',encoding='utf-8') 66 | global_file_no += 1 67 | of.close() 68 | 69 | 70 | def parse_args(): 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--source_path', 73 | type=str, 74 | default="/data/data_warehouse/llm/source_data/cn-mnbvc", 75 | help='Directory containing trained actor model') 76 | parser.add_argument('--dest_path', 77 | type=str, 78 | default="/data/data_warehouse/llm/source_data/cn-mnbvc2", 79 | help='Directory containing trained actor model') 80 | parser.add_argument('--dataset_name', 81 | type=str, 82 | default="cn-mnbvc", 83 | help="") 84 | parser.add_argument('--max_size', 85 | type=int, 86 | default=200 * 1024 * 1024, 87 | help="max chunk size") 88 | args = parser.parse_args() 89 | return args 90 | 91 | if __name__ == "__main__": 92 | args = parse_args() 93 | 94 | if not os.path.exists(args.dest_path): 95 | os.makedirs(args.dest_path, exist_ok=True) 96 | make_clean(args) 97 | 98 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_data_collator_completion_only.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import unittest 15 | 16 | import torch 17 | from transformers import AutoTokenizer 18 | 19 | from trl import DataCollatorForCompletionOnlyLM 20 | 21 | 22 | class DataCollatorForCompletionOnlyLMTester(unittest.TestCase): 23 | def test_data_collator_finds_response_template_llama2_tokenizer(self): 24 | # this should ideally be tested with meta-llama/Llama-2-7b-hf 25 | self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab") 26 | self.instruction = """### System: You are a helpful assistant. 27 | 28 | ### User: How much is 2+2? 
29 | 30 | ### Assistant: 2+2 equals 4""" 31 | self.instruction_template = "\n### User:" 32 | self.response_template = "\n### Assistant:" 33 | 34 | # GPT2Tokenizer: [198, 21017, 11787, 25] -> [11787, 25] 35 | # Llama2Tokenizer: [29871, 13, 2277, 29937, 4911, 29901] -> [2277, 29937, 4911, 29901] 36 | self.tokenized_instruction_w_context = self.tokenizer.encode( 37 | self.instruction_template, add_special_tokens=False 38 | )[2:] 39 | 40 | # GPT2Tokenizer: [198, 21017, 15286, 25] -> [15286, 25] 41 | # Llama2Tokenizer: [29871, 13, 2277, 29937, 4007, 22137, 29901] -> [2277, 29937, 4007, 22137, 29901] 42 | self.tokenized_response_w_context = self.tokenizer.encode(self.response_template, add_special_tokens=False)[2:] 43 | 44 | # Plain check on string 45 | self.assertIn(self.response_template, self.instruction) 46 | self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False) 47 | 48 | # Test the fix for #598 49 | # Pass already tokenized (w context) and truncated response_template so token_ids are like in the instruction + response 50 | self.collator = DataCollatorForCompletionOnlyLM(self.tokenized_response_w_context, tokenizer=self.tokenizer) 51 | self.collator.torch_call([self.tokenized_instruction]) 52 | 53 | # Test for PR #749 54 | # Pass already tokenized (w context) instruction and response both so token_ids are like in the instruction + response 55 | self.collator = DataCollatorForCompletionOnlyLM( 56 | self.tokenized_response_w_context, self.tokenized_instruction_w_context, tokenizer=self.tokenizer 57 | ) 58 | self.collator.torch_call([self.tokenized_instruction]) 59 | 60 | def test_data_collator_handling_of_long_sequences(self): 61 | self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab") 62 | self.instruction = """### System: You are a helpful assistant. 63 | 64 | ### User: How much is 2+2? I'm asking because I'm not sure. And I'm not sure because I'm not good at math. 65 | """ 66 | self.response_template = "\n### Assistant:" 67 | # check DataCollatorForCompletionOnlyLM using response template only 68 | self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False) 69 | self.collator = DataCollatorForCompletionOnlyLM(self.response_template, tokenizer=self.tokenizer) 70 | encoded_instance = self.collator.torch_call([self.tokenized_instruction]) 71 | result = torch.all(encoded_instance["labels"] == -100) 72 | self.assertTrue(result, "Not all values in the tensor are -100.") 73 | 74 | # check DataCollatorForCompletionOnlyLM using response template and instruction template 75 | self.instruction_template = "\n### User:" 76 | self.collator = DataCollatorForCompletionOnlyLM( 77 | self.response_template, self.instruction_template, tokenizer=self.tokenizer 78 | ) 79 | encoded_instance = self.collator.torch_call([self.tokenized_instruction]) 80 | result = torch.all(encoded_instance["labels"] == -100) 81 | self.assertTrue(result, "Not all values in the tensor are -100.") 82 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/tests/test_iterative_sft_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import tempfile 15 | import unittest 16 | 17 | import torch 18 | from datasets import Dataset 19 | from parameterized import parameterized 20 | from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments 21 | 22 | from trl import IterativeSFTTrainer 23 | 24 | 25 | class IterativeTrainerTester(unittest.TestCase): 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.model_id = "trl-internal-testing/dummy-GPT2-correct-vocab" 29 | cls.model = AutoModelForCausalLM.from_pretrained(cls.model_id) 30 | cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_id) 31 | cls.tokenizer.pad_token = cls.tokenizer.eos_token 32 | 33 | # get t5 as seq2seq example: 34 | model_id = "trl-internal-testing/tiny-T5ForConditionalGeneration-correct-vocab" 35 | cls.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) 36 | cls.t5_tokenizer = AutoTokenizer.from_pretrained(model_id) 37 | 38 | def _init_tensor_dummy_dataset(self): 39 | dummy_dataset_dict = { 40 | "input_ids": [torch.tensor([5303, 3621]), torch.tensor([3666, 1438, 318]), torch.tensor([5303, 3621])], 41 | "attention_mask": [torch.tensor([1, 1]), torch.tensor([1, 1, 1]), torch.tensor([1, 1])], 42 | "labels": [torch.tensor([5303, 3621]), torch.tensor([3666, 1438, 318]), torch.tensor([5303, 3621])], 43 | } 44 | 45 | dummy_dataset = Dataset.from_dict(dummy_dataset_dict) 46 | dummy_dataset.set_format("torch") 47 | return dummy_dataset 48 | 49 | def _init_textual_dummy_dataset(self): 50 | dummy_dataset_dict = { 51 | "texts": ["Testing the IterativeSFTTrainer.", "This is a test of the IterativeSFTTrainer"], 52 | "texts_labels": ["Testing the IterativeSFTTrainer.", "This is a test of the IterativeSFTTrainer"], 53 | } 54 | 55 | dummy_dataset = Dataset.from_dict(dummy_dataset_dict) 56 | dummy_dataset.set_format("torch") 57 | return dummy_dataset 58 | 59 | def setUp(self): 60 | # initialize trainer 61 | self.model.train() 62 | return super().setUp() 63 | 64 | @parameterized.expand( 65 | [ 66 | ["gpt2", "tensor"], 67 | ["gpt2", "text"], 68 | ["t5", "tensor"], 69 | ["t5", "text"], 70 | ] 71 | ) 72 | def test_iterative_step_from_tensor(self, model_name, input_name): 73 | with tempfile.TemporaryDirectory() as tmp_dir: 74 | # initialize dataset 75 | if input_name == "tensor": 76 | dummy_dataset = self._init_tensor_dummy_dataset() 77 | inputs = { 78 | "input_ids": dummy_dataset["input_ids"], 79 | "attention_mask": dummy_dataset["attention_mask"], 80 | "labels": dummy_dataset["labels"], 81 | } 82 | else: 83 | dummy_dataset = self._init_textual_dummy_dataset() 84 | inputs = { 85 | "texts": dummy_dataset["texts"], 86 | "texts_labels": dummy_dataset["texts_labels"], 87 | } 88 | 89 | if model_name == "gpt2": 90 | model = self.model 91 | tokenizer = self.tokenizer 92 | else: 93 | model = self.t5_model 94 | tokenizer = self.t5_tokenizer 95 | 96 | args = TrainingArguments( 97 | output_dir=tmp_dir, 98 | per_device_train_batch_size=2, 99 | max_steps=2, 100 | ) 101 | iterative_trainer = IterativeSFTTrainer(model=model, args=args, tokenizer=tokenizer) 102 | 103 | 
iterative_trainer.step(**inputs) 104 | 105 | for param in iterative_trainer.model.parameters(): 106 | assert param.grad is not None 107 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/alignprop_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | from dataclasses import dataclass, field 5 | from typing import Literal, Optional 6 | 7 | from ..core import flatten_dict 8 | from ..import_utils import is_bitsandbytes_available, is_torchvision_available 9 | 10 | 11 | @dataclass 12 | class AlignPropConfig: 13 | """ 14 | Configuration class for AlignPropTrainer 15 | """ 16 | 17 | # common parameters 18 | exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] 19 | """the name of this experiment (by default is the file name without the extension name)""" 20 | run_name: Optional[str] = "" 21 | """Run name for wandb logging and checkpoint saving.""" 22 | seed: int = 0 23 | """Seed value for random generations""" 24 | log_with: Optional[Literal["wandb", "tensorboard"]] = None 25 | """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" 26 | log_image_freq = 1 27 | """Logging Frequency for images""" 28 | tracker_kwargs: dict = field(default_factory=dict) 29 | """Keyword arguments for the tracker (e.g. wandb_project)""" 30 | accelerator_kwargs: dict = field(default_factory=dict) 31 | """Keyword arguments for the accelerator""" 32 | project_kwargs: dict = field(default_factory=dict) 33 | """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" 34 | tracker_project_name: str = "trl" 35 | """Name of project to use for tracking""" 36 | logdir: str = "logs" 37 | """Top-level logging directory for checkpoint saving.""" 38 | 39 | # hyperparameters 40 | num_epochs: int = 100 41 | """Number of epochs to train.""" 42 | save_freq: int = 1 43 | """Number of epochs between saving model checkpoints.""" 44 | num_checkpoint_limit: int = 5 45 | """Number of checkpoints to keep before overwriting old ones.""" 46 | mixed_precision: str = "fp16" 47 | """Mixed precision training.""" 48 | allow_tf32: bool = True 49 | """Allow tf32 on Ampere GPUs.""" 50 | resume_from: Optional[str] = "" 51 | """Resume training from a checkpoint.""" 52 | sample_num_steps: int = 50 53 | """Number of sampler inference steps.""" 54 | sample_eta: float = 1.0 55 | """Eta parameter for the DDIM sampler.""" 56 | sample_guidance_scale: float = 5.0 57 | """Classifier-free guidance weight.""" 58 | train_batch_size: int = 1 59 | """Batch size (per GPU!) 
to use for training.""" 60 | train_use_8bit_adam: bool = False 61 | """Whether to use the 8bit Adam optimizer from bitsandbytes.""" 62 | train_learning_rate: float = 1e-3 63 | """Learning rate.""" 64 | train_adam_beta1: float = 0.9 65 | """Adam beta1.""" 66 | train_adam_beta2: float = 0.999 67 | """Adam beta2.""" 68 | train_adam_weight_decay: float = 1e-4 69 | """Adam weight decay.""" 70 | train_adam_epsilon: float = 1e-8 71 | """Adam epsilon.""" 72 | train_gradient_accumulation_steps: int = 1 73 | """Number of gradient accumulation steps.""" 74 | train_max_grad_norm: float = 1.0 75 | """Maximum gradient norm for gradient clipping.""" 76 | negative_prompts: Optional[str] = "" 77 | """Comma-separated list of prompts to use as negative examples.""" 78 | truncated_backprop_rand: bool = True 79 | """Truncated Randomized Backpropation randomizes truncation to different diffusion timesteps""" 80 | truncated_backprop_timestep: int = 49 81 | """Absolute timestep to which the gradients are being backpropagated. If truncated_backprop_rand is False""" 82 | truncated_rand_backprop_minmax: tuple = (0, 50) 83 | """Range of diffusion timesteps for randomized truncated backprop.""" 84 | 85 | def to_dict(self): 86 | output_dict = {} 87 | for key, value in self.__dict__.items(): 88 | output_dict[key] = value 89 | return flatten_dict(output_dict) 90 | 91 | def __post_init__(self): 92 | if self.log_with not in ["wandb", "tensorboard"]: 93 | warnings.warn( 94 | "Accelerator tracking only supports image logging if `log_with` is set to 'wandb' or 'tensorboard'." 95 | ) 96 | 97 | if self.log_with == "wandb" and not is_torchvision_available(): 98 | warnings.warn("Wandb image logging requires torchvision to be installed") 99 | 100 | if self.train_use_8bit_adam and not is_bitsandbytes_available(): 101 | raise ImportError( 102 | "You need to install bitsandbytes to use 8bit Adam. " 103 | "You can install it with `pip install bitsandbytes`." 104 | ) 105 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/trl/trainer/cpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Literal, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class CPOConfig(TrainingArguments): 22 | r""" 23 | CPOConfig collects all training arguments related to the [`CPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 
32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_target_length (`int`, defaults to `None`): 35 | The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in CPO loss. 38 | label_smoothing (`float`, defaults to 0): 39 | The label smoothing factor. This argument is required if you want to use the default data collator. 40 | loss_type (`str`, defaults to `sigmoid`): 41 | The type of loss to use. This argument is required if you want to use the default data collator. 42 | label_pad_token_id (`int`, defaults to `-100`): 43 | The label pad token id. This argument is required if you want to use the default data collator. 44 | cpo_alpha (`float`, defaults to `1.0`): 45 | A hyperparameter that controls the strength of the BC regularizer in CPO training. 46 | simpo_gamma (`float`, defaults to `0.5`): 47 | A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. 48 | padding_value (`int`, defaults to `None`): 49 | The padding value if it is different to the tokenizer's pad_token_id. 50 | truncation_mode (`str`, defaults to `keep_end`): 51 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 52 | generate_during_eval (`bool`, defaults to `False`): 53 | Whether to sample and log generations during evaluation step. 54 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 55 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 56 | disable_dropout (`bool`, defaults to `True`): 57 | Whether or not to disable dropouts in `model`. 58 | model_init_kwargs (`Optional[Dict]`, *optional*): 59 | Dict of Optional kwargs to pass when instantiating the model from a string 60 | dataset_num_proc (`Optional[int]`, *optional*): 61 | The number of workers to use to tokenize the data. Defaults to None. 62 | """ 63 | 64 | max_length: Optional[int] = None 65 | max_prompt_length: Optional[int] = None 66 | max_completion_length: Optional[int] = None 67 | max_target_length: Optional[int] = None 68 | 69 | beta: float = 0.1 70 | label_smoothing: float = 0 71 | loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" 72 | disable_dropout: bool = True 73 | cpo_alpha: float = 1.0 74 | simpo_gamma: float = 0.5 75 | 76 | label_pad_token_id: int = -100 77 | padding_value: int = None 78 | truncation_mode: str = "keep_end" 79 | generate_during_eval: bool = False 80 | is_encoder_decoder: Optional[bool] = None 81 | 82 | model_init_kwargs: Optional[Dict] = None 83 | 84 | dataset_num_proc: Optional[int] = None 85 | 86 | def __post_init__(self): 87 | if self.loss_type == "kto_pair": 88 | raise ValueError("Support for kto_pair has been removed in CPOTrainer. 
Please use KTOTrainer.") 89 | return super().__post_init__() 90 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/test60.py: -------------------------------------------------------------------------------- 1 | from general_policy import GClean 2 | 3 | cleaner = GClean(50) 4 | 5 | sentence = '018骞存湯锛岃?琛屾牳 蹇冧竴绾祫鏈?厖瓒崇巼銆佷竴绾祫鏈?厖瓒崇巼鍙婅祫鏈?厖瓒崇巼鍒嗗埆涓?.54%锛?.39%鍙?1.50%锛屽潎婊冻鐩戠?杈炬爣瑕佹眰锛岃 緝涓婂勾鏈?.26銆?.21鍙?.30涓?櫨鍒嗙偣銆偂XIIa href=\"\" target=\"_blank\">聚彩网大发快3骗局:榛勬鼎涓?巻浠讳腑鍥介摱琛 屼笟鍗忎細鍏氬?濮斿憳銆佺?涔暱锛屽浗鏈夐噸鐐归噾铻嶆満鏋?浗杩涘嚭鍙i摱琛?鐩戜簨浼氬壇灞绾T笓鑱岀洃浜嬨佸贰瑙嗗憳绛夎亴 鍔備粬鎷湁鏀垮簻銆佺洃绠満鏋勩佽?涓氱浼氱粍缁囩瓑澶氫釜棰嗗煙鐨勫伐浣滅粡鍘嗭紝鐔熸倝閾惰?涓氱洃绠斂绛栵紝闀挎湡鍏虫敞 閲戣瀺绉戞妧棰嗗煙銆侞欢乐彩大发快3怎么下载鍘讳簡鍖婚櫌鍚庯紝鎴戝啀涔熸病鍥炲幓杩囬偅涓?瘯闀滅殑鎴块棿銆傜劧鍚庯紝鎴戝 氨鍥炲?浜嗐偂XII/span>。浣曠珛宄扮O锛屽湪鍒涘缓鍥介檯绉戝?腑蹇冭繖鏂归潰锛屽姏搴?杩涗竴姝姞澶c傚湪纭?鏂归潰锛岄?拰娣卞 湷闈犺繎鐨勬渤濂楀湴鍖猴紝澶T綋涓?.89骞虫柟鍏?鍔犲揩瑙勫垝锛屼富瑕佹槸娣卞湷鏂归潰閰嶅悎棣欐腐鏂归潰锛屽姞蹇?鍒掞紝鏃> 舵満鏉欢鎴愮啛浠悗鎺繘寤鸿?銆傜浠惰?鏂藉缓璁剧殑鍐嶄竴涓?柟闈一氨鏄?湪骞垮窞銆佹繁鍦冲埌棣欐腐锛屽箍宸炪佺彔娴峰埌婢抽 棬锛屽缓璁句袱鏉浗闄呮按骞崇殑绉戞妧鍒涙柊璧板粖锛岄櫎姝箣澶栵紝杩樿?闄一垱鍔為?垱鏂扮爺绌堕櫌锛岃繕鏈夊叾浠栨柟闈三殑 涓浜涙秹鍙婂埌绉戞妧鍒涙柊鏂归潰鐨勯噸澶T妇鎺?紝瑕佺户缁?鍔涙帹杩涳紝涓夊湴瀵嗗垏閰嶅悎锛屽皢浼氫骇鐢?+1+1杩滆繙澶T簬3 鐨勬晥鏋溿偂XII/span>鍦汉鎵嶆湇鍔柟闈?紝浣滀负鍥藉?棣栨壒娴峰?楂樺眰娆汉鎵嶅垱鏂板垱涓氬熀鍦帮紝缁忓紑鍖哄湪鍏浗棣栧垱 鈥滀笂绠佷笅绠皬鈥濆叏閾炬潯浜烘墠鏈嶅姟妯紡锛岀洰鍓嶅尯鍐呰仛闆嗗悇绫婚珮灞傛?浜烘墠鎬绘暟浣嶅眳骞垮窞甯傚悇鍖虹?涓銆 佸箍涓滅渷鍓嶅垪锛屽尯鍩熷紩鎵嶈仛鎵嶅憟鐜板浗闄呭寲銆佺郴缁熷寲鐨勬佸娍銆傛帴涓嬫潵锛岀粡寮鍖哄皢杩涗竴姝繁鍖栤滀笂绠 佷笅绠皬鈥濈殑鍏摼鏉汉鎵嶆湇鍔a寮忥紝鎵撻氫汉鎵嶅垱鏂板垱涓氥佸眳浣忋佺敓娲荤殑鈥滄渶鍚庝竴鍏?' 6 | 7 | sentence2 = '浴室玻璃隔断+固定杆","content":"看了好多小红书的浴室隔断都好好看就也想要一个无边框的,但是沟通出错了原来想要的是靠墙用卡扣的结果不知道怎么变成这种卡槽了,不过也还可以。横杆是玻璃厂老板来装的时候说我玻璃太大了不安全,卡槽返工的时候自己又拿了几种玻璃固定杆让我选非给我装的(不要钱),不然只固定墙上一面地上打胶就会很晃。这种和三角形固定的我还是选了这个,好歹还能挂点东西。装之前老板问了好几遍你家有没有小孩,要是有小孩子无边框的这个不可以。我这边没有小孩子,有小孩也不会用这个卫生间所以完全没问题。买的时候销售都不会说,不会说家里有小孩不要选无边框,玻璃大需要固定之类的。' 8 | 9 | sentence3 = '广州峰帆贸易有限公司 是一家集礼品策划、设计、开发、生产、销售、服务于一体的专业礼品企业。公司经营的产品包括:广州商务礼品、周年庆礼品、年会 礼品、节日礼品、促销礼品、广告宣传礼品、积分兑换礼品、员工福利礼品、特色礼品等系列产品。有礼品方面需求的广大客户,我们 将免费为您提供礼品策划,设计方案,以优质,全面的服务于广大客户, 欢迎您与我们联系!广州峰帆贸易有公司秉着\"务实进取\"的 企业宗旨,以诚信为本,开拓创新的精神,以追求卓越品质,提供优质服务的理念,通过现代化的管理打 造了优秀团队,为客户提供> 卓越的产品和优质的服务是我们不断追求的目标。我们以市场和客户的需求为导向,紧跟国际国内潮流,精选国 内外优质厂商达成战> 略联盟,成为专业礼品团购代理经销体系,为客户提供更多时尚、新奇、特色、实用并物美价优的礼品。上一篇: 广州迪欣贸易有限> 公司下一篇:湖南醴陵红官窑瓷业有限公司 成功签约广东省物流行业协会纯定制网站服务合作! 恭喜八爪鱼网络与省电信工程(成> 立于1950年)签约合作! 恭喜八爪鱼网络与美视晶莹(银幕行业世界前二)签约合作 恭喜八爪鱼网络与中标数据(证券代码:87070 8)签约合作! 恭喜八爪鱼网络与华南理工大学成功签约网站建设 做网站 建网站 小程序开发 网站制作 企业网站建设 广州网站建> 设 网站 有限公司 广州 成功案例 客户 广东 广州市 第一次 服务好 品牌 生物科技 事务所 家具 官网 律师 首饰 集团 核心 后台 代码 珠宝首饰 鑫诺 旅游观光 国际 服装纺织 公司 电气 设备 八爪鱼网络 美容化妆 装饰工程 空间设计 准备工作 前期 广告创意 钟表 美斯 鱼网 安防 装饰设计 有保障 网站设计 尼曼 餐饮管理 汽配 电子 吸引 工程 深圳市 教育培训 金融投资 酒店管理 生物> 医药 汽车 电力 物流运输 家居 电子电器 节能环保 金融 乐享 服装 汉光 医药 建材 机械设备 房地产业 食品 技术 旅行社 祖诺 > 餐饮 品牌策划 礼品盒 制造厂 很省心 优网站 精艺 玛雅 通讯 厦门 网页 彩印 智能 展览设计 建筑工程 设计公司 法律师 新闻 上 海互联网 企业管理 网络技术 艺术 携程 服饰 设计师 包满意 商城 值得过 浙江 银饰品 曼古 集团公司 红谷 伊顿 前端 格兰 幸福 西饼 互联网+ 百益 制衣厂 中山 大象 程序员 信得过 德马吉 力天 全案策划 朗昇 麦睿仕 四川 营销策划 广告设计 服务,服务 向 日葵 经理 知识产权 代表 中新 大陆 软银 实体商业 购物商城 手机 外贸 网络营销 企业 八鱼网 广告公司 阿里 华为 全球最具品> 牌价值百强 时间 趋势 捷达 优派 官窑 瓷业 企业邮箱 服务器 体育 空间租用 域名注册 平台 商家 日用品 皮具 超音速 电缆 消防 设备 德科 仪器 芬尼 东莞市 精密机械 实验设备 新材料 医疗 东津 尔曼 天使 珠宝 米莱 养堂 产业 科方 生物技术 鼎科 宝莎曼 首饰珠宝 数控设备 机具 可卡 成都 信息 恒爱 照明设备 轻工业 广东省 菲达 技工学校 中国留学生 我们能提供什么 夸克 家私 木 业 金融服务 兴隆 食品工业 研究所 陈记 投资管理 科创 极至 创意 凤凰 中国 企业顾问 第一次做网站、对网站不了解? 旧网站改 版、对网站有初步认识了? 我应该做个什么网站?您可以填写右边的表格,让我们了解您的项目需求,这是一个良好的开始,我们将> 会尽快与你取得联系。也欢迎您给我们打电话,让我们马上进行沟通吧!' 
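# sentence4 and sentence5 below are Chinese court rulings (a criminal ruling and an enforcement ruling);
# the script feeds them to cleaner.common_zhLessThan20 at the bottom, a check that, judging by its name,
# presumably flags text falling below a per-line Chinese-character count threshold.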
10 | 11 | sentence4 = '山东省菏泽市中级人民法 院刑 事 裁 定 书(2017)鲁17刑更967号罪犯袁德朋,男,汉族,1980年1月2日出生于山东省曹县。现在山东省菏泽监狱服刑。> 二〇一五年二月十日,本院作出(2014)菏刑一初字第11号刑事附带民事判决,以被告人袁德朋犯故意伤害罪,判处有期徒刑十五年 。宣判后,被告人不服提出上诉。二一五年六月十一日,山东省高级人民法院作出(2015)鲁刑三终字第44号刑事附带民事裁定,驳 回上诉,维持原判。宣判后交付执行。执行机关山东省菏泽监狱于2017年11月22日提出减刑建议书,报送本院审理。本院依法组成 合议庭进行了审理。本案现已审理终结。执行机关山东省菏泽监狱,以罪犯袁德朋在服刑期间能认罪悔罪;认真遵守法律法规及监> 规;接受教育改造;积极参加思想、文化、职业技术教育;积极参加劳动,努力完成劳动任务,确有悔改表现等为由,提出予以减 刑建议。并附罪犯袁德朋在服刑期间的表现、奖励记录等书证。经审理查明,罪犯袁德朋在山东省菏泽监狱服刑改造期间认罪悔罪 ;认真遵守法律法规及监规;接受教育改造;积极参加思想、文化、职业技术教育;积极参加劳动,努力完成劳动任务。曾获表扬 四次;被评为2016年度监区级罪犯改造积极分子。另查明,该犯在服刑期间主动履行民事赔偿20873.7元。上述事实,有罪犯奖励> 审批表、计分考核明细表、领款条等证据予以证实。本院认为,罪犯袁德朋在服刑期间确有悔改表现,符合减刑条件。并结合其犯 罪的性质、具体情节、社会危害程度、原判刑罚及财产性判项的履行情况及交付执行后的一贯表现等因素,依照《中华人民共和国 刑事诉讼法》第二百六十二条第二款,《中华人民共和国刑法》第七十九条、第七十八条之规定,裁定如下:对罪犯袁德朋减去有 期徒刑八个月的刑罚执行。(刑期自2013年7月30日起至2027年11月29日止)本裁定送达后即发生法律效力。' 12 | 13 | sentence5 = '吉林省蛟河市人民法院执 行 裁 定 书(2020)吉0281执402号被执行人:马波,男,汉族,37岁。被执行人马波罚金执行一案,本院作出的(2020)吉0281> 刑初78号刑事判决书,主文如下:一、被告人马波犯诈骗罪,判处有期徒刑二年二个月,并处罚金人民币二万元。(刑期从判决执 行之日起计算。判决执行前先行羁押的,羁押一日折抵刑期一日,即自2018年2月15日起至2020年4月14日止。罚金限于本判决生效 后三十日内缴纳。)二、追缴被告人违法所得34560元,返还被害人。该判决书已经发生法律效力,本院于2020年5月6日立案执行> ,要求被执行人马波履行生效法律文书中确定的义务。本院在执行过程中,分别对被执行人马波采取了如下措施:对被执行人马波 通过判决书中所确认的地址向被执行人送达了执行通知书、报告财产令、限制消费令等法律手续,对被执行人在金融部门的存款进 行了调查,未发现存款。通过查询被执行人名下的机动车辆,无车辆登记信息。通过对被执行人不动产调查,被执行人名下无不动 产登记信息。被执行人马波手机号码已为空号,同时新冠病毒疫情爆发,蛟河地区疫情严重,暂时无法下乡寻找。鉴于被执行人马 波没有主动对民事裁判涉财产部分履行,故本院已向被执行人发出限制消费令,对其今后的行为予以限制。本院认为:在本院穷尽 执行措施后,暂未发现被执行人有可供执行的财产,可以认定被执行人暂不具备履行生效法律文书确定的法律义务的能力。本院已 向申请执行人告知上诉执行情况。依照《最高人民法院关于适用的解释》第五百一十九条、最高人民法院《关于严格规范终结本次 执行程序的规定(试行)》第七条之规定,裁定如下:终结本次执行程序。终结本次执行程序后,待发现被执行人有可供执行财产 的,可以向本院申请恢复执行。再次申请不受申请执行时效期间的限制。本裁定送达后即发生法律效力。如不服本裁定,可自本裁 定书送达之日起十日内,向本院提出书面执行异议。' 14 | 15 | print(cleaner.common_zhLessThan20(sentence4)) 16 | print(cleaner.common_zhLessThan20(sentence5)) 17 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import glob 4 | import json 5 | import argparse 6 | from tqdm import tqdm 7 | import multiprocessing as mp 8 | import transformers 9 | 10 | tokenizer_kwargs = { 11 | "use_fast": True, 12 | "revision": "xbGPT" 13 | } 14 | tokenizer_path="/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct" 15 | tokenizer = transformers.AutoTokenizer.from_pretrained("/mnt/public/open_source_AI/Meta-Llama-3.1-8B-Instruct") 16 | tokenizer.pad_token = tokenizer.eos_token 17 | 18 | def jobj2count(jobj): 19 | """ 20 | mp process controller 21 | """ 22 | for itm in tqdm(jobj): 23 | yield itm 24 | 25 | def process_file(js): 26 | global tokenizer 27 | num_tokens = 0 28 | text = ' '.join(js['data']).strip() 29 | tokens = tokenizer.encode(text,add_special_tokens=False)#(js['content']) 30 | num_tokens += len(tokens) 31 | return {'num_tokens': num_tokens, "score": float(js['score'])} 32 | 33 | 34 | def llama_tokenizer(args): 35 | input_dir = args.dataset_path 36 | src_files = sorted(glob.glob(os.path.join(input_dir, "*.jsonl"), recursive=True)) 37 | print(f"src_files: {src_files}") 38 | 39 | pool = mp.Pool(args.num_workers) 40 | total_tokens = 0 41 | 42 | records = {} 43 | records["files"] = [] 44 | 45 | for idx,xfile in tqdm(enumerate(src_files),total=len(src_files)): 46 | 47 | tokens = 0 48 | difficulty = 0.0 49 | filename = os.path.basename(xfile)#.replace(".jsonl","") 50 | print(f"process file: {filename}") 51 | 52 | with open(xfile,"r",encoding='utf-8') as fin: 53 | line_content = [json.loads(line) for line in fin.readlines()] 
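# Each parsed record is fanned out to the multiprocessing pool below: process_file joins js['data']
# into one string, encodes it with the Llama tokenizer, and returns the token count plus the record's
# 'score'; the loop sums these into the per-file token and difficulty totals reported in `records`.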
54 | for res in pool.imap(process_file, jobj2count(line_content)): 55 | tokens += res['num_tokens'] 56 | difficulty += res['score'] 57 | 58 | print(f'file {filename} has {tokens} tokens, {difficulty} difficulty scores.') 59 | records["files"].append( 60 | { 61 | "filename":filename, 62 | "llama_tokens":tokens, 63 | "difficulty_scores":difficulty, 64 | "total_samples":len(line_content), 65 | "avg_tokens_per_sample":1.0*tokens/len(line_content), 66 | "avg_difficulty_score_per_sample":1.0*difficulty/len(line_content), 67 | } 68 | ) 69 | total_tokens += tokens 70 | records["total_llama_tokens"] = total_tokens 71 | return records 72 | 73 | def parse_args(): 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('--dataset_name', 76 | type=str, 77 | default="jdItem", 78 | help='dataset name') 79 | parser.add_argument('--dataset_path', 80 | type=str, 81 | default="/data_warehouse/llm/source_data/JDItem_pattern_dataset/SampledRawDataset/", 82 | help='source path') 83 | parser.add_argument('--output_path', 84 | type=str, 85 | default="/data_warehouse/llm/source_data/JDItem_pattern_dataset/", 86 | help='source path') 87 | 88 | parser.add_argument('--tokenizer_path', 89 | type=str, 90 | default="/xxxx/chinese_llama_13b_plus84", 91 | help="tokenizer path, default LLaMA tokenizer") 92 | parser.add_argument('--version', 93 | type=str, 94 | default="v1", 95 | help="" 96 | ) 97 | parser.add_argument('--num_workers', 98 | type=int, 99 | default=32, 100 | help="") 101 | args = parser.parse_args() 102 | return args 103 | 104 | if __name__ == '__main__': 105 | 106 | tokenizer_kwargs = { 107 | "use_fast": True, 108 | "revision": "productGPT" 109 | } 110 | 111 | args = parse_args() 112 | records = {} 113 | 114 | #tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, **tokenizer_kwargs) 115 | #tokenizer.pad_token = tokenizer.eos_token 116 | print(f"num of llama tokens: {tokenizer.vocab_size}") 117 | 118 | records = llama_tokenizer(args) 119 | records['dataset'] = args.dataset_name 120 | 121 | output_file = os.path.join(args.output_path,"{}-meta-info-{}.json".format(args.dataset_name,args.version)) 122 | if os.path.exists(output_file): os.remove(output_file) 123 | with open(output_file, 'w') as f: 124 | json.dump(records, f, indent=4) 125 | 126 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/preprocess/preprocess_cn-wechat.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | #import jieba_fast as jieba 5 | #import gzip 6 | import argparse 7 | #import chardet 8 | from tqdm import tqdm 9 | from os import listdir, path 10 | 11 | def get_head_tail_sentence(args): 12 | global_file_no = 0 13 | global_id_no = 0 14 | 15 | dest_file = os.path.join(args.dest_path,"wechat_content_sentences.txt") 16 | if os.path.exists(dest_file): os.remove(dest_file) 17 | of = open(dest_file,'w',encoding='utf-8') 18 | 19 | subsets = sorted(listdir(args.source_path)) 20 | for dir_no,file_name in tqdm(enumerate(subsets),total=len(subsets)): 21 | 22 | input_file = os.path.join(args.source_path,file_name) 23 | with open(input_file, 'r',encoding='utf-8') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) < 100:continue 27 | js_dict = json.loads(line) 28 | content = js_dict["content"].strip() 29 | if len(content) < 100: continue 30 | 31 | ''' 32 | split_flg = [',',';','。',',',';','。','!','?',' ','\n','\t'] 33 | 34 | fpos = 1 35 | while fpos < len(content) and 
content[fpos] not in split_flg: fpos += 1 36 | head = content[0:fpos] 37 | 38 | lpos = len(content) - 1 -1 39 | while lpos > 0 and content[lpos] not in split_flg: lpos -= 1 40 | tail = content[lpos+1:] 41 | ''' 42 | head = content[50:len(content)-50] 43 | #if len(head) > args.topk: head = head[:args.topk] 44 | #if len(tail) > args.topk: tail = tail[-args.topk:] 45 | print(head,file=of) 46 | #if tail != head: print(tail,file=of) 47 | of.close() 48 | 49 | def text_segment(args): 50 | # /root/llm/source_data/wechat_head_tail_sentences.txt 51 | dest_file = os.path.join("/root/llm/source_data/","wechat_head_tail_sentences_segment.txt") 52 | if os.path.exists(dest_file): os.remove(dest_file) 53 | of = open(dest_file,'w',encoding='utf-8') 54 | 55 | with open("/root/llm/source_data/wechat_head_tail_sentences.txt", 'r',encoding='utf-8') as f: 56 | for line in f: 57 | line = line.strip() 58 | if len(line) < 3:continue 59 | seg_list = jieba.cut(line,cut_all=False) 60 | text = ' '.join([item for item in seg_list if len(item) > 1]) 61 | print(text,file=of) 62 | of.close() 63 | 64 | def extract_keyphrase(args): 65 | keyphrse_dict = dict() 66 | 67 | idx = 0 68 | with open("/root/llm/source_data/phrases.txt",'r') as f: 69 | for line in tqdm(f): 70 | line = line.strip() 71 | if len(line) < 1:continue 72 | tokens = line.split("\t") 73 | if len(tokens) != 3: 74 | print("tokens:",tokens) 75 | continue 76 | phrase = tokens[1].replace("_","") 77 | if phrase not in keyphrse_dict: 78 | keyphrse_dict[phrase] = [1,tokens[2]] 79 | else: 80 | keyphrse_dict[phrase][0] = keyphrse_dict[phrase][0] + 1 81 | idx += 1 82 | #if idx > 50000: break 83 | # 84 | keyphrse_list = sorted(keyphrse_dict.items(), key = lambda kv:(kv[1], kv[0]),reverse = True) 85 | for item in keyphrse_list: 86 | # ('眼下正是', [1, '102.464']) 87 | freq = item[1][0] 88 | muinfo = item[1][1] 89 | phrase = item[0] 90 | #if freq < 100: continue 91 | print(f"{phrase}\t{freq}\t{muinfo}") 92 | 93 | def parse_args(): 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--source_path', 96 | type=str, 97 | default="/data/data_warehouse/llm/source_data/cn-wechat", 98 | help='Directory containing trained actor model') 99 | parser.add_argument('--dest_path', 100 | type=str, 101 | default="/root/llm/source_data/", 102 | help='Directory containing trained actor model') 103 | parser.add_argument('--dataset_name', 104 | type=str, 105 | default="cn-wechat", 106 | help="") 107 | parser.add_argument('--topk', 108 | type=int, 109 | default=20, 110 | help="max chunk size") 111 | args = parser.parse_args() 112 | return args 113 | 114 | if __name__ == "__main__": 115 | args = parse_args() 116 | 117 | if not os.path.exists(args.dest_path): 118 | os.makedirs(args.dest_path, exist_ok=True) 119 | #get_head_tail_sentence(args) 120 | #text_segment(args) 121 | extract_keyphrase(args) 122 | 123 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/special_policy.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | import os 3 | import re 4 | import sys 5 | import numpy as np 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | 8 | class SpecialPolicies(): 9 | def __init__(self,): 10 | pass 11 | 12 | @staticmethod 13 | def IsChatperText(text,threashold=10,thresh_ratio=0.25): 14 | 15 | if len(text) < 1: return False 16 | 17 | first_num = len(re.findall(r'第[0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+章', text)) 18 | second_num = 
len(re.findall(r'([0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+)', text)) 19 | 20 | if first_num > 10 or second_num > 10: return True 21 | 22 | first_num = 0 23 | second_num = 0 24 | for item in re.findall(r'第[0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+章', text): 25 | first_num += len(item) 26 | for item in re.findall(r'([0-9一二三四五六七八九十百千万壹贰叁肆伍陆柒捌玖拾佰仟]+)', text): 27 | second_num += len(item) 28 | 29 | frist_ratio = 1.0*first_num / len(text) 30 | second_ratio = 1.0*second_num / len(text) 31 | 32 | if frist_ratio > thresh_ratio or second_ratio > thresh_ratio: return True 33 | 34 | return False 35 | 36 | @staticmethod 37 | def RemoveReference(text): 38 | # 这种行为不合情理pp=66–67, 70。 39 | # 开创了塞萨洛尼基王国pp=62–63。 40 | 41 | #text = re.sub(r"p{1,2}=\d+–*\d*,*\s*\d*–*\d*","",text) 42 | regex = re.compile(r"p{1,2}=(\d+–*\d*,*\s*)+") 43 | text = regex.sub("",text) 44 | return text 45 | 46 | @staticmethod 47 | def RemoveLastLineBreak(text): 48 | text = text.strip().strip("\n").strip() 49 | return text 50 | 51 | @staticmethod 52 | def RemoveHeadWords(text): 53 | head_words = ["概述","图片发自简书app","[转载]"] 54 | for item in head_words: 55 | text = text.lstrip(item) 56 | text = text.strip() 57 | return text 58 | 59 | 60 | @staticmethod 61 | def RemoveSpamFromContent(text,spam): 62 | regex = re.compile(spam) 63 | text = regex.sub("",text) 64 | text = text.strip() 65 | return text 66 | 67 | @staticmethod 68 | def RemoveAllReference(text): 69 | # 参考文献: 70 | regex = re.compile(r"参考文献[::].*") 71 | text = regex.sub("",text) 72 | text = text.strip() 73 | return text 74 | 75 | @staticmethod 76 | def delete_like_collect_comment(sentence): 77 | ''' 78 | For ods_zdm_detail, match and remove 点赞 收藏 评论 79 | ''' 80 | like = re.compile(r'\d*点赞') 81 | collect = re.compile('\d*收藏') 82 | comment = re.compile(r'\d*评论') 83 | sent = like.sub('', sentence) 84 | sent = collect.sub('', sent) 85 | sent = comment.sub('', sent) 86 | return sent 87 | 88 | @staticmethod 89 | def delete_author_claim(sentence): 90 | ''' 91 | For ods_zdm_detail, match and remove 作者声明xxxx 92 | ''' 93 | pattern = re.compile(r'作者声明.*|本文商品由什么.*|小编注.*|以上是.*分享.*|全文完.*|(感谢|谢谢).*(众测|测评|机会|值友).*|我是.*|(链接|商品链接).*?(去购买|去看看)|未经授权,不得转载.*|本文[^。]*.$|\|赞\d.*|The.{0,1}End.*') 94 | return pattern.sub('', sentence) 95 | 96 | @staticmethod 97 | def detect_lottery(sentence): 98 | ''' 99 | For ods_zdm 100 | ''' 101 | pattern = re.compile(r'(获奖|有奖).*活动') 102 | if pattern.search(sentence): 103 | return False 104 | else: 105 | return sentence 106 | 107 | # 2023-08-16 108 | @staticmethod 109 | def RemovewechatID(text): 110 | # 参考文献: 111 | regex = re.compile(r"微信.{0,5}[a-zA-Z_][-_a-zA-Z0-9]{5,19}") 112 | text = regex.sub("",text) 113 | text = text.strip() 114 | return text 115 | @staticmethod 116 | def RemoveAllUnicode(text): 117 | # Unicode 编码 like <200a> <200b>: 118 | regex = re.compile(r"<[0-f]{4}>") 119 | text = regex.sub("",text) 120 | text = text.strip() 121 | return text 122 | 123 | @staticmethod 124 | def is_mixed_ENCN(text): 125 | def is_chinese(char) -> bool: 126 | return char.isdigit() or ('\u4e00' <= char <= '\u9fa5') or char in ['\u3002','\uff1b','\uff0c','\uff1a','\u201c','\u201d','\uff08','\uff09','\u3001','\uff1f','\u300a','\u300b'] 127 | def is_mixed_seq(seq): 128 | sub = np.array(list(map(is_chinese, seq)), dtype=int) 129 | if np.sum(np.abs(sub[1:]-sub[:-1]))>=6: 130 | # print('ilegal:', seq) 131 | return True 132 | return False 133 | sample_num = 10 if len(text)>70 else len(text)//10 134 | starts = np.linspace(0, len(text)-7, num=sample_num, endpoint=True, dtype=int) 135 | for start in starts: 136 | 
if is_mixed_seq(text[start:start+7]): 137 | return True 138 | return False 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/requirements/conda_dpo_requirements.txt: -------------------------------------------------------------------------------- 1 | name: dpo 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - bzip2=1.0.8=h5eee18b_6 8 | - ca-certificates=2024.7.2=h06a4308_0 9 | - ld_impl_linux-64=2.38=h1181459_1 10 | - libffi=3.4.4=h6a678d5_1 11 | - libgcc-ng=11.2.0=h1234567_1 12 | - libgomp=11.2.0=h1234567_1 13 | - libstdcxx-ng=11.2.0=h1234567_1 14 | - libuuid=1.41.5=h5eee18b_0 15 | - ncurses=6.4=h6a678d5_0 16 | - openssl=3.0.14=h5eee18b_0 17 | - pip=24.2=py310h06a4308_0 18 | - python=3.10.13=h955ad1f_0 19 | - readline=8.2=h5eee18b_0 20 | - setuptools=72.1.0=py310h06a4308_0 21 | - sqlite=3.45.3=h5eee18b_0 22 | - tk=8.6.14=h39e8969_0 23 | - wheel=0.43.0=py310h06a4308_0 24 | - xz=5.4.6=h5eee18b_1 25 | - zlib=1.2.13=h5eee18b_1 26 | - pip: 27 | - accelerate==0.28.0 28 | - aiohappyeyeballs==2.3.5 29 | - aiohttp==3.10.3 30 | - aiosignal==1.3.1 31 | - annotated-types==0.7.0 32 | - anyio==4.4.0 33 | - async-timeout==4.0.3 34 | - attrs==24.2.0 35 | - cachetools==5.4.0 36 | - certifi==2024.7.4 37 | - charset-normalizer==3.3.2 38 | - click==8.1.7 39 | - cloudpickle==3.0.0 40 | - cmake==3.30.2 41 | - datasets==2.21.0 42 | - deepspeed==0.14.5 43 | - dill==0.3.8 44 | - diskcache==5.6.3 45 | - distro==1.9.0 46 | - docker-pycreds==0.4.0 47 | - docstring-parser==0.16 48 | - einops==0.8.0 49 | - exceptiongroup==1.2.2 50 | - fastapi==0.112.0 51 | - fastchat==0.1.0 52 | - filelock==3.15.4 53 | - flash-attn==2.6.3 54 | - frozenlist==1.4.1 55 | - fsspec==2024.6.1 56 | - gitdb==4.0.11 57 | - gitpython==3.1.43 58 | - h11==0.14.0 59 | - hjson==3.1.0 60 | - httpcore==1.0.5 61 | - httptools==0.6.1 62 | - httpx==0.27.0 63 | - huggingface-hub==0.24.5 64 | - idna==3.7 65 | - interegular==0.3.3 66 | - jinja2==3.1.4 67 | - jiter==0.5.0 68 | - joblib==1.4.2 69 | - jsonschema==4.23.0 70 | - jsonschema-specifications==2023.12.1 71 | - lark==1.2.2 72 | - llvmlite==0.43.0 73 | - lm-format-enforcer==0.10.1 74 | - loguru==0.7.2 75 | - markdown-it-py==3.0.0 76 | - markupsafe==2.1.5 77 | - mdurl==0.1.2 78 | - mpmath==1.3.0 79 | - msgpack==1.0.8 80 | - multidict==6.0.5 81 | - multiprocess==0.70.16 82 | - nest-asyncio==1.6.0 83 | - networkx==3.3 84 | - ninja==1.11.1.1 85 | - numba==0.60.0 86 | - numpy==1.26.4 87 | - nvidia-cublas-cu12==12.1.3.1 88 | - nvidia-cuda-cupti-cu12==12.1.105 89 | - nvidia-cuda-nvrtc-cu12==12.1.105 90 | - nvidia-cuda-runtime-cu12==12.1.105 91 | - nvidia-cudnn-cu12==8.9.2.26 92 | - nvidia-cufft-cu12==11.0.2.54 93 | - nvidia-curand-cu12==10.3.2.106 94 | - nvidia-cusolver-cu12==11.4.5.107 95 | - nvidia-cusparse-cu12==12.1.0.106 96 | - nvidia-ml-py==12.535.161 97 | - nvidia-nccl-cu12==2.20.5 98 | - nvidia-nvjitlink-cu12==12.6.20 99 | - nvidia-nvtx-cu12==12.1.105 100 | - nvitop==1.3.2 101 | - openai==1.40.6 102 | - outlines==0.0.34 103 | - packaging==24.1 104 | - pandas==2.2.2 105 | - platformdirs==4.2.2 106 | - prometheus-client==0.20.0 107 | - prometheus-fastapi-instrumentator==7.0.0 108 | - protobuf==5.27.3 109 | - psutil==6.0.0 110 | - py-cpuinfo==9.0.0 111 | - pyarrow==17.0.0 112 | - pycryptodome==3.20.0 113 | - pydantic==2.8.2 114 | - pydantic-core==2.20.1 115 | - pygments==2.18.0 116 | - python-dateutil==2.9.0.post0 117 | - python-dotenv==1.0.1 118 | - 
pytz==2024.1 119 | - pyyaml==6.0.2 120 | - ray==2.34.0 121 | - referencing==0.35.1 122 | - regex==2024.7.24 123 | - requests==2.32.3 124 | - rich==13.7.1 125 | - rpds-py==0.20.0 126 | - safetensors==0.4.4 127 | - scipy==1.14.0 128 | - sentencepiece==0.2.0 129 | - sentry-sdk==2.13.0 130 | - setproctitle==1.3.3 131 | - shtab==1.7.1 132 | - six==1.16.0 133 | - smmap==5.0.1 134 | - sniffio==1.3.1 135 | - starlette==0.37.2 136 | - sympy==1.13.2 137 | - termcolor==2.4.0 138 | - tiktoken==0.7.0 139 | - tokenizers==0.15.2 140 | - torch==2.3.0 141 | - tqdm==4.66.5 142 | - transformers==4.38.2 143 | - triton==2.3.0 144 | - trl==0.9.6 145 | - typing-extensions==4.12.2 146 | - tyro==0.8.8 147 | - tzdata==2024.1 148 | - urllib3==2.2.2 149 | - uvicorn==0.30.6 150 | - uvloop==0.19.0 151 | - vllm==0.4.3 152 | - vllm-flash-attn==2.5.8.post2 153 | - wandb==0.17.7 154 | - watchfiles==0.23.0 155 | - websockets==12.0 156 | - xformers==0.0.26.post1 157 | - xxhash==3.5.0 158 | - yarl==1.9.4 159 | prefix: /mnt/lptest/Miniconda3/envs/dpo 160 | -------------------------------------------------------------------------------- /codes_datasets/Postraining_dpo/xllm/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import logging 3 | import torch 4 | import transformers 5 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 6 | from einops import rearrange 7 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func 8 | from flash_attn.bert_padding import unpad_input, pad_input 9 | from loguru import logger 10 | 11 | def forward( 12 | self, 13 | hidden_states: torch.Tensor, 14 | attention_mask: Optional[torch.Tensor] = None, 15 | position_ids: Optional[torch.Tensor] = None, 16 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 17 | output_attentions: bool = False, 18 | use_cache: bool = False, 19 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 20 | """Input shape: Batch x Time x Channel 21 | 22 | attention_mask: [bsz, q_len] 23 | """ 24 | bsz, q_len, _ = hidden_states.size() 25 | 26 | query_states = ( 27 | self.q_proj(hidden_states) 28 | .view(bsz, q_len, self.num_heads, self.head_dim) 29 | .transpose(1, 2) 30 | ) 31 | key_states = ( 32 | self.k_proj(hidden_states) 33 | .view(bsz, q_len, self.num_heads, self.head_dim) 34 | .transpose(1, 2) 35 | ) 36 | value_states = ( 37 | self.v_proj(hidden_states) 38 | .view(bsz, q_len, self.num_heads, self.head_dim) 39 | .transpose(1, 2) 40 | ) 41 | # [bsz, q_len, nh, hd] 42 | # [bsz, nh, q_len, hd] 43 | 44 | kv_seq_len = key_states.shape[-2] 45 | assert past_key_value is None, "past_key_value is not supported" 46 | 47 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 48 | query_states, key_states = apply_rotary_pos_emb( 49 | query_states, key_states, cos, sin, position_ids 50 | ) 51 | # [bsz, nh, t, hd] 52 | assert not output_attentions, "output_attentions is not supported" 53 | assert not use_cache, "use_cache is not supported" 54 | 55 | # Flash attention codes from 56 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py 57 | 58 | # transform the data into the format required by flash attention 59 | qkv = torch.stack( 60 | [query_states, key_states, value_states], dim=2 61 | ) # [bsz, nh, 3, q_len, hd] 62 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] 63 | # We have disabled _prepare_decoder_attention_mask in LlamaModel 64 | # the 
attention_mask should be the same as the key_padding_mask 65 | key_padding_mask = attention_mask 66 | 67 | if key_padding_mask is None: 68 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 69 | max_s = q_len 70 | cu_q_lens = torch.arange( 71 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 72 | ) 73 | print("cu_q_lens:", cu_q_lens.size()) 74 | output = flash_attn_varlen_qkvpacked_func( 75 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 76 | ) 77 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz) 78 | else: 79 | nheads = qkv.shape[-2] 80 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 81 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 82 | x_unpad = rearrange( 83 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads 84 | ) 85 | output_unpad = flash_attn_varlen_qkvpacked_func( 86 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 87 | ) 88 | output = rearrange( 89 | pad_input( 90 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len 91 | ), 92 | "b s (h d) -> b s h d", 93 | h=nheads, 94 | ) 95 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None 96 | 97 | 98 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 99 | # requires the attention mask to be the same as the key_padding_mask 100 | def _prepare_decoder_attention_mask( 101 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 102 | ): 103 | # [bsz, seq_len] 104 | return attention_mask 105 | 106 | 107 | def replace_llama_attn_with_flash_attn(): 108 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 109 | if cuda_major < 8: 110 | logging.warning( 111 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
112 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 113 | ) 114 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 115 | _prepare_decoder_attention_mask 116 | ) 117 | logger.warning("Replace with flash_attention") 118 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 119 | -------------------------------------------------------------------------------- /codes_datasets/DataCleaning/utils/ocr_nlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import json 5 | import hashlib 6 | import re 7 | import multiprocessing as mp 8 | import argparse 9 | from os import listdir 10 | from tqdm import tqdm 11 | from util import load_set_from_txt 12 | 13 | PROCESS = 2 14 | 15 | def extract_sentences_with_colon(text): 16 | sentence_delimiters = r'[,.,。;!?]' 17 | sentences = re.split(sentence_delimiters, text) 18 | extracted_sentences = [] 19 | remaining_text = "" 20 | 21 | for sentence in sentences: 22 | if ':' in sentence or ':' in sentence: 23 | extracted_sentences.append(sentence.strip()) 24 | else: 25 | remaining_text += sentence.strip() + " " 26 | 27 | return extracted_sentences, remaining_text.strip() 28 | 29 | def controller(input_file): 30 | for line in open(input_file,'r',encoding="utf-8"): 31 | line = line.strip() 32 | if len(line) < 5:continue 33 | 34 | js_dict = json.loads(line) 35 | item_id = js_dict["item_id"] 36 | ocr_ret_list = js_dict["ocr_ret"] 37 | ocr_text = "" 38 | for item in ocr_ret_list: 39 | image_name = item["img_name"] 40 | ocr_ret = item["ocr_ret"] 41 | one_img_content = concat_one_img(ocr_ret) 42 | if len(one_img_content) < 5:continue 43 | if ocr_text != "": ocr_text += "。" 44 | ocr_text += one_img_content 45 | yield item_id, ocr_text 46 | 47 | #reload(sys) 48 | #sys.setdefaultencoding('utf-8') 49 | 50 | ''' 51 | {"ocr_ret": [{"img_name": "/vmware_data/gaodiqi/jingdong_imgs/100000040875/detailimg_e6d026e0d93c15d7174425f9c778eb7e.jpg", "ocr_ret": [{"index": [[248.0, 172.0], [619.0, 172.0], [619.0, 217.0], [248.0, 217.0]], "content": " 年风霜,匠心如初", "confidence": 0.9926699995994568}, {"index": [[58.0, 284.0], [741.0, 283.0], [741.0, 324.0], [58.0, 325.0]], "content": "品质依然,福东海健康食材的选择", "confidence": 0.9772330522537231}, {"index": [[246. 
0, 351.0], [306.0, 351.0], [306.0, 371.0], [246.0, 371.0]], "content": "黄民", "confidence": 0.8339320421218872}, {"index": [[406.0, 352.0], [470.0, 349.0], [471.0, 369.0], [407.0, 372.0]], "content": "胎菊", "confidence": 0.8963175415992737}, {"index": [[571.0, 351.0], [631.0, 351.0], [631.0, 371.0], [571.0, 371.0]], "content": "贡菊", "confidence": 0.8061279058456421}, 52 | ''' 53 | def concat_one_img(ocr_ret_list): 54 | 55 | ans = "" 56 | duplicate_set = set() 57 | for item in ocr_ret_list: 58 | index = item["index"] 59 | content = item["content"].strip() 60 | if len(content) < 1:continue 61 | md5 = hashlib.md5(content.encode('utf-8')).hexdigest() 62 | #print("md5:",md5) 63 | if md5 in duplicate_set: continue 64 | duplicate_set.add(md5) 65 | if ans != "": ans += "," 66 | ans += content 67 | return ans 68 | 69 | def extract(input): 70 | item_id, ocr_text = input 71 | pairs, text = [], '' 72 | if len(ocr_text) > 10: 73 | pairs, text = extract_sentences_with_colon(ocr_text) 74 | output = { 75 | "id": item_id, 76 | "source": "OCR", 77 | "source_id":"", 78 | "content": {"pairs": pairs,"text": text, "qa":""} 79 | } 80 | return output 81 | 82 | def HandleSingleFile(input_file, output): 83 | pools = mp.Pool(PROCESS) 84 | 85 | flush_steps = 0 86 | flush_per_steps = 50 87 | for res in pools.imap(extract, controller(input_file)): 88 | if res is not None: 89 | jstr = json.dumps(res, ensure_ascii=False) 90 | output.write(jstr+"\n") 91 | flush_steps += 1 92 | if flush_steps % flush_per_steps == 0: 93 | output.flush() 94 | 95 | def parse_args(): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument('--source_path', 98 | type=str, 99 | default="/root/llm/source_data/cn-JD-ocrtext/", 100 | help='Source directory containing the cn-JD-ocrtext OCR result files') 101 | parser.add_argument('--dest_path', 102 | type=str, 103 | default="/root/llm/clean_data/cn-JD-ocrtext/", 104 | help='Output directory for the extracted OCR text files') 105 | args = parser.parse_args() 106 | return args 107 | 108 | if __name__ == "__main__": 109 | args = parse_args() 110 | files = sorted(listdir(args.source_path)) 111 | 112 | Output_Dir = os.path.join(args.dest_path) 113 | 114 | if not os.path.exists(Output_Dir): 115 | os.makedirs(Output_Dir, exist_ok=True) 116 | 117 | for input_file in tqdm(files,total=len(files)): 118 | input = os.path.join(args.source_path, input_file) 119 | output_file = os.path.join(Output_Dir, input_file) 120 | if os.path.exists(output_file): os.remove(output_file) 121 | output = open(output_file, 'a+', encoding='utf-8') 122 | 123 | HandleSingleFile(input, output) 124 | 125 | output.close() 126 | 127 | --------------------------------------------------------------------------------