├── tests ├── __init__.py ├── datasets │ ├── __init__.py │ └── test_dataset.py ├── models │ ├── __init__.py │ └── test_auto_model.py ├── utils │ ├── __init__.py │ └── test_data_utils.py └── pipeline │ └── test_auto_pipeline.py ├── src └── lmflow │ ├── utils │ ├── __init__.py │ ├── flash_attention │ │ ├── __init__.py │ │ ├── gpt_neo_flash_attention.py │ │ ├── bloom_flash_attention.py │ │ └── llama_flash_attention.py │ ├── position_interpolation │ │ ├── __init__.py │ │ └── llama_rope_scaled_monkey_patch.py │ └── constants.py │ ├── models │ ├── __init__.py │ ├── interfaces │ │ ├── __init__.py │ │ └── tunable.py │ ├── base_model.py │ ├── regression_model.py │ ├── decoder_model.py │ ├── encoder_decoder_model.py │ ├── auto_model.py │ └── text_regression_model.py │ ├── pipeline │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── peft_trainer.py │ ├── base_pipeline.py │ ├── base_tuner.py │ ├── base_aligner.py │ └── auto_pipeline.py │ ├── version.py │ ├── datasets │ └── __init__.py │ └── __init__.py ├── scripts ├── bash.sh ├── run_unittest.sh ├── run_app.sh ├── vocab_extension │ ├── convert_json_to_txt.sh │ ├── merge_tokenizer.sh │ ├── train_tokenizer.sh │ └── train_merge_tokenizer.sh ├── run_evaluation.sh ├── run_vis_chatbot_blip2.sh ├── run_chatbot.sh ├── run_chatbot_cpu.sh ├── run_chatbot_chatglm.sh ├── run_inference_multimodal_model.sh ├── run_evaluation_accelerator.sh ├── run_evaluation_with_lora.sh ├── run_benchmark.sh ├── run_vis_chatbot_minigpt4.sh ├── .nfs0000000094418362000004c4 ├── run_finetune.sh ├── run_finetune_with_lora.sh ├── run_multistage_finetune.sh ├── run_reward_modeling.sh ├── run_finetune_with_lora_save_aggregated_weights.sh ├── run_raft_align.sh ├── data_preprocess │ ├── run_data_preprocess.sh │ ├── count.py │ ├── shuffle.py │ ├── sample.py │ ├── add_prompt.py │ ├── concat.py │ ├── add_end_mark.py │ ├── merge.py │ └── concat_shuffle_split.py ├── run_all_benchmark.sh ├── run_vis_chatbot_gradio_minigpt4.sh └── export_llama_state_dict_checkpoint.py ├── assets ├── logo.png ├── features.png ├── robin13b.png ├── robin33b.png ├── robin65b.png ├── robin7b.jpg ├── robin7b_.png ├── Cockatoo3b.png ├── Cockatoo7b.png ├── Parakeets.png ├── robin13b_.jpg ├── robin33b_.png ├── robin65b_.png ├── colab-shell-chatbot-demo.png └── multimodal-chatbot-demo.gif ├── docs ├── source │ ├── _static │ │ ├── eq.png │ │ ├── logo.png │ │ ├── nll.png │ │ ├── ppl.png │ │ ├── raft.png │ │ ├── IT_sample1.png │ │ ├── IT_sample2.png │ │ ├── IT_sample3.png │ │ ├── IT_sample4.png │ │ ├── IT_sample5.png │ │ ├── IT_sample6.png │ │ ├── IT_sample7.png │ │ ├── raft_idea.PNG │ │ ├── benchmark-1.png │ │ ├── benchmark-2.png │ │ ├── raft_reward.PNG │ │ ├── raft-demo-examples.png │ │ ├── logo5.svg │ │ ├── logo4.svg │ │ ├── logo6.svg │ │ ├── logo.svg │ │ ├── logo2.svg │ │ └── logo3.svg │ ├── about │ │ ├── authors.md │ │ ├── index.md │ │ └── changelog.md │ ├── blogs │ │ └── index.md │ ├── api │ │ └── _autosummary │ │ │ └── lmflow.args.rst │ ├── examples │ │ ├── index.md │ │ ├── checkpoints.md │ │ ├── medical_finetune.md │ │ ├── DATASETS.md │ │ └── TASK_GUIDE.md │ └── conf.py └── requirements.txt ├── service ├── static │ └── assets │ │ ├── logo.png │ │ └── background.png └── app.py ├── .github ├── ISSUE_TEMPLATE │ ├── blank-template.md │ ├── api-feedback.md │ ├── feature-request.md │ └── bug-report.md └── workflows │ └── documentation.yaml ├── .gitattributes ├── examples ├── ds_config.json ├── merge_lora.py ├── evaluation.py ├── finetune.py ├── chatbot.py └── raft_align.py ├── configs ├── ds_config_eval.json ├── 
ds_config_chatbot.json ├── ds_config_multimodal.json ├── accelerator_singlegpu_config.yaml ├── accelerator_multigpu_config.yaml ├── ds_config_zero3_for_eval.json ├── ds_config_zero2.json └── ds_config_zero3.json ├── docker ├── Dockerfile └── README.md ├── requirements.txt ├── CONTRIBUTING.md ├── utils ├── train_tokenizer.py ├── convert_minigpt4_checkpoints.py ├── convert_json_to_txt.py ├── make_delta.py ├── merge_tokenizer.py └── lm_evaluator.py ├── setup.py ├── experimental └── RAFT-diffusion │ ├── requirements.txt │ └── README.md ├── .gitignore ├── output_models └── download.sh └── CODE_OF_CONDUCT.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/lmflow/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/pipeline/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/models/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" -------------------------------------------------------------------------------- /src/lmflow/utils/flash_attention/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/utils/position_interpolation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/bash.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Shell and python scripts goes here -------------------------------------------------------------------------------- /scripts/run_unittest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m unittest discover 4 | -------------------------------------------------------------------------------- /assets/logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/logo.png -------------------------------------------------------------------------------- /assets/features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/features.png -------------------------------------------------------------------------------- /assets/robin13b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin13b.png -------------------------------------------------------------------------------- /assets/robin33b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin33b.png -------------------------------------------------------------------------------- /assets/robin65b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin65b.png -------------------------------------------------------------------------------- /assets/robin7b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin7b.jpg -------------------------------------------------------------------------------- /assets/robin7b_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin7b_.png -------------------------------------------------------------------------------- /assets/Cockatoo3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Cockatoo3b.png -------------------------------------------------------------------------------- /assets/Cockatoo7b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Cockatoo7b.png -------------------------------------------------------------------------------- /assets/Parakeets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Parakeets.png -------------------------------------------------------------------------------- /assets/robin13b_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin13b_.jpg -------------------------------------------------------------------------------- /assets/robin33b_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin33b_.png -------------------------------------------------------------------------------- /assets/robin65b_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin65b_.png -------------------------------------------------------------------------------- /docs/source/_static/eq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/eq.png 
-------------------------------------------------------------------------------- /docs/source/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/logo.png -------------------------------------------------------------------------------- /docs/source/_static/nll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/nll.png -------------------------------------------------------------------------------- /docs/source/_static/ppl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/ppl.png -------------------------------------------------------------------------------- /docs/source/_static/raft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft.png -------------------------------------------------------------------------------- /service/static/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/service/static/assets/logo.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample1.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample2.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample3.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample4.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample5.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample6.png -------------------------------------------------------------------------------- /docs/source/_static/IT_sample7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample7.png -------------------------------------------------------------------------------- /docs/source/_static/raft_idea.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft_idea.PNG -------------------------------------------------------------------------------- /assets/colab-shell-chatbot-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/colab-shell-chatbot-demo.png -------------------------------------------------------------------------------- /docs/source/_static/benchmark-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/benchmark-1.png -------------------------------------------------------------------------------- /docs/source/_static/benchmark-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/benchmark-2.png -------------------------------------------------------------------------------- /docs/source/_static/raft_reward.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft_reward.PNG -------------------------------------------------------------------------------- /service/static/assets/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/service/static/assets/background.png -------------------------------------------------------------------------------- /docs/source/_static/raft-demo-examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft-demo-examples.png -------------------------------------------------------------------------------- /docs/source/about/authors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | 4 | Shizhe Diao, Rui Pan, Hanze Dong, Ka Shun Shum, Jipeng Zhang, Wei Xiong, Tong Zhang 5 | -------------------------------------------------------------------------------- /docs/source/blogs/index.md: -------------------------------------------------------------------------------- 1 | # Blogs 2 | 3 | ## 2023 4 | 5 | 6 | ```{toctree} 7 | :maxdepth: 1 8 | 9 | benchmark 10 | ``` 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Template 3 | about: Other issues 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /assets/multimodal-chatbot-demo.gif: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:206296519e7892d65cacc48c7e98c6743301b74c29401d57e325197bd6e41cac 3 | size 79864304 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==5.3.0 2 | pydata-sphinx-theme==0.13.1 3 | sphinx_design==0.3.0 4 | myst-parser==1.0.0 5 | sphinx-autoapi==2.0.0 6 | 
matplotlib==3.4.1 7 | numpydoc==0.9.1 -------------------------------------------------------------------------------- /docs/source/about/index.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | 4 | ```{toctree} 5 | :maxdepth: 2 6 | 7 | changelog 8 | ``` 9 | 10 | 11 | ```{toctree} 12 | :maxdepth: 2 13 | 14 | authors 15 | ``` 16 | -------------------------------------------------------------------------------- /src/lmflow/models/interfaces/tunable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """Tunable class 4 | """ 5 | 6 | from abc import ABC 7 | 8 | 9 | class Tunable(ABC): 10 | pass 11 | -------------------------------------------------------------------------------- /src/lmflow/pipeline/base_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """ BasePipeline. 4 | """ 5 | 6 | from abc import ABC # abstract class 7 | 8 | class BasePipeline(ABC): 9 | pass 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/api-feedback.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: API Feedback 3 | about: Provide feedback regarding the current design of the API. 4 | title: "[API Design]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /scripts/run_app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml service/app.py \ 4 | --model_name_or_path gpt2 \ 5 | --torch_dtype bfloat16 \ 6 | --max_new_tokens 200 -------------------------------------------------------------------------------- /src/lmflow/models/base_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """Base model class. 
4 | """ 5 | 6 | from abc import ABC 7 | 8 | 9 | class BaseModel(ABC): 10 | 11 | def __init__(self, *args, **kwargs): 12 | pass 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-detectable=false 2 | *.js linguist-detectable=false 3 | *.ipynb linguist-detectable=false 4 | *RAFT.pdf filter=lfs diff=lfs merge=lfs -text 5 | *.gif filter=lfs diff=lfs merge=lfs -text 6 | assets/*.gif filter=lfs diff=lfs merge=lfs -text 7 | -------------------------------------------------------------------------------- /examples/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "steps_per_print": 2000, 9 | "train_micro_batch_size_per_gpu": 1, 10 | "wall_clock_breakdown": false 11 | } 12 | -------------------------------------------------------------------------------- /configs/ds_config_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false 4 | }, 5 | "bf16": { 6 | "enabled": false 7 | }, 8 | "steps_per_print": 2000, 9 | "train_micro_batch_size_per_gpu": 1, 10 | "wall_clock_breakdown": false 11 | } 12 | -------------------------------------------------------------------------------- /scripts/vocab_extension/convert_json_to_txt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd data && bash download.sh wiki_zh_eval && cd - 4 | 5 | python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \ 6 | --output_path ./data/wiki_zh_eval/converted_data.txt \ 7 | --overwrite True -------------------------------------------------------------------------------- /src/lmflow/models/regression_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """General regression model.""" 4 | 5 | from lmflow.models.base_model import BaseModel 6 | 7 | 8 | class RegressionModel(BaseModel): 9 | 10 | def __init__(self, *args, **kwargs): 11 | pass 12 | -------------------------------------------------------------------------------- /scripts/vocab_extension/merge_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p ./output_models/new_tokenizer 3 | python utils/merge_tokenizer.py --tokenizer_dir pinkmanlove/llama-7b-hf \ 4 | --chinese_sp_model_file ./output_models/new_tokenizer/example.model \ 5 | --output_dir ./output_models/merged_tokenizer \ -------------------------------------------------------------------------------- /docs/source/about/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | ## Version 0.0.1 (Mar 28, 2023) 5 | 6 | The first public version. 7 | 8 | Task tuning, instruction tuning, on user defined datasets. 9 | 10 | A simple and extensible API for developers. 11 | 12 | Efficient finetuning with LoRA. 13 | 14 | Simplified model inference framework. 
15 | 16 | -------------------------------------------------------------------------------- /scripts/vocab_extension/train_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p ./output_models/merged_tokenizer 3 | python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \ 4 | --model_type bpe \ 5 | --output_dir ./output_models/new_tokenizer \ 6 | --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \ 7 | --vocab_size 20000 -------------------------------------------------------------------------------- /scripts/run_evaluation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0 \ 4 | deepspeed examples/evaluation.py \ 5 | --answer_type medmcqa \ 6 | --model_name_or_path gpt2-large \ 7 | --dataset_path data/MedQA-USMLE/validation \ 8 | --deepspeed examples/ds_config.json \ 9 | --inference_batch_size_per_device 1 \ 10 | --metric accuracy 11 | -------------------------------------------------------------------------------- /scripts/run_vis_chatbot_blip2.sh: -------------------------------------------------------------------------------- 1 | model=Salesforce/blip2-opt-2.7b 2 | deepspeed examples/vis_chatbot.py --model_name_or_path ${model} \ 3 | --deepspeed configs/ds_config_multimodal.json \ 4 | --arch_type vision_encoder_decoder \ 5 | --task vqa \ 6 | ${@:1} 7 | -------------------------------------------------------------------------------- /scripts/run_chatbot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model=gpt2 4 | lora_args="" 5 | if [ $# -ge 1 ]; then 6 | model=$1 7 | fi 8 | if [ $# -ge 2 ]; then 9 | lora_args="--lora_model_path $2" 10 | fi 11 | 12 | CUDA_VISIBLE_DEVICES=0 \ 13 | deepspeed examples/chatbot.py \ 14 | --deepspeed configs/ds_config_chatbot.json \ 15 | --model_name_or_path ${model} \ 16 | ${lora_args} 17 | -------------------------------------------------------------------------------- /scripts/run_chatbot_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model=gpt2 4 | lora_args="" 5 | if [ $# -ge 1 ]; then 6 | model=$1 7 | fi 8 | if [ $# -ge 2 ]; then 9 | lora_args="--lora_model_path $2" 10 | fi 11 | 12 | CUDA_VISIBLE_DEVICES="" \ 13 | python examples/chatbot.py \ 14 | --deepspeed configs/ds_config_chatbot.json \ 15 | --model_name_or_path ${model} \ 16 | --device "cpu" \ 17 | ${lora_args} 18 | -------------------------------------------------------------------------------- /src/lmflow/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """This Python code defines a class Dataset with methods for initializing, loading, 2 | and manipulating datasets from different backends such as Hugging Face and JSON. 3 | 4 | The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging 5 | Face dataset, mapping datasets, and retrieving the backend dataset and arguments. 
6 | """ 7 | from lmflow.datasets.dataset import Dataset 8 | -------------------------------------------------------------------------------- /configs/ds_config_chatbot.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "comms_logger": { 9 | "enabled": false, 10 | "verbose": false, 11 | "prof_all": false, 12 | "debug": false 13 | }, 14 | "steps_per_print": 20000000000000000, 15 | "train_micro_batch_size_per_gpu": 1, 16 | "wall_clock_breakdown": false 17 | } 18 | -------------------------------------------------------------------------------- /configs/ds_config_multimodal.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false 4 | }, 5 | "bf16": { 6 | "enabled": false 7 | }, 8 | "comms_logger": { 9 | "enabled": false, 10 | "verbose": false, 11 | "prof_all": false, 12 | "debug": false 13 | }, 14 | "steps_per_print": 20000000000000000, 15 | "train_micro_batch_size_per_gpu": 1, 16 | "wall_clock_breakdown": false 17 | } 18 | -------------------------------------------------------------------------------- /docs/source/api/_autosummary/lmflow.args.rst: -------------------------------------------------------------------------------- 1 | lmflow.args 2 | =========== 3 | 4 | .. automodule:: lmflow.args 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | .. rubric:: Classes 17 | 18 | .. autosummary:: 19 | 20 | DatasetArguments 21 | FinetunerArguments 22 | InferencerArguments 23 | ModelArguments 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /scripts/run_chatbot_chatglm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model=THUDM/chatglm-6b 4 | lora_args="" 5 | if [ $# -ge 1 ]; then 6 | model=$1 7 | fi 8 | if [ $# -ge 2 ]; then 9 | lora_args="--lora_model_path $2" 10 | fi 11 | 12 | CUDA_VISIBLE_DEVICES=0 \ 13 | deepspeed examples/chatbot.py \ 14 | --arch_type encoder_decoder \ 15 | --deepspeed configs/ds_config_chatbot.json \ 16 | --model_name_or_path ${model} \ 17 | ${lora_args} -------------------------------------------------------------------------------- /configs/accelerator_singlegpu_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: 'NO' 3 | downcast_bf16: 'no' 4 | dynamo_config: 5 | dynamo_backend: INDUCTOR 6 | gpu_ids: 7 | machine_rank: 0 8 | main_training_function: main 9 | mixed_precision: bf16 10 | num_machines: 1 11 | num_processes: 1 12 | rdzv_backend: static 13 | same_network: true 14 | tpu_env: [] 15 | tpu_use_cluster: false 16 | tpu_use_sudo: false 17 | use_cpu: false 18 | -------------------------------------------------------------------------------- /configs/accelerator_multigpu_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: 'no' 4 | dynamo_config: 5 | dynamo_backend: INDUCTOR 6 | gpu_ids: 7 | machine_rank: 0 8 | main_training_function: main 9 | mixed_precision: bf16 10 | num_machines: 1 11 | num_processes: 2 12 | rdzv_backend: static 13 | same_network: true 14 | tpu_env: [] 15 | tpu_use_cluster: false 16 | tpu_use_sudo: false 17 | use_cpu: false 18 | main_process_port: 11000 19 | 
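Both accelerate configs above are meant to be passed to `accelerate launch --config_file ...`, as done in `scripts/run_app.sh` and `scripts/run_evaluation_accelerator.sh`. A minimal sketch of a two-GPU launch with the multi-GPU config (not a script shipped in the repository; the evaluator flags are borrowed from `scripts/run_evaluation_accelerator.sh`, and the output directory name is only an example):

```sh
# Sketch only: reuse the evaluator invocation from scripts/run_evaluation_accelerator.sh,
# swapping in the 2-process multi-GPU accelerate config defined above.
CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file configs/accelerator_multigpu_config.yaml \
    examples/evaluation.py \
    --answer_type usmle \
    --model_name_or_path gpt2-large \
    --dataset_path data/MedQA-USMLE/validation \
    --deepspeed examples/ds_config.json \
    --metric accuracy \
    --output_dir output_dir/accelerator_2_cards \
    --inference_batch_size_per_device 1 \
    --use_accelerator_for_evaluator True \
    --torch_dtype bfloat16
```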
-------------------------------------------------------------------------------- /scripts/run_inference_multimodal_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model="Salesforce/blip-image-captioning-base" 4 | lora_args="" 5 | if [ $# -ge 1 ]; then 6 | model=$1 7 | fi 8 | if [ $# -ge 2 ]; then 9 | lora_args="--lora_model_path $2" 10 | fi 11 | 12 | CUDA_VISIBLE_DEVICES=0 \ 13 | deepspeed examples/inference.py \ 14 | --deepspeed configs/ds_config_multimodal.json \ 15 | --model_name_or_path ${model} \ 16 | --arch_type vision_encoder_decoder \ 17 | ${lora_args} 18 | -------------------------------------------------------------------------------- /src/lmflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__ as internal_version 2 | 3 | __version__ = internal_version 4 | 5 | from transformers.utils import check_min_version 6 | from transformers.utils.versions import require_version 7 | 8 | from lmflow import args, datasets, models, pipeline, utils 9 | 10 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 11 | check_min_version("4.27.0.dev0") 12 | 13 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") -------------------------------------------------------------------------------- /scripts/run_evaluation_accelerator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml examples/evaluation.py \ 4 | --answer_type usmle \ 5 | --model_name_or_path gpt2-large \ 6 | --dataset_path data/MedQA-USMLE/validation \ 7 | --use_ram_optimized_load True \ 8 | --deepspeed examples/ds_config.json \ 9 | --metric accuracy \ 10 | --output_dir output_dir/accelerator_1_card \ 11 | --inference_batch_size_per_device 1 \ 12 | --use_accelerator_for_evaluator True \ 13 | --torch_dtype bfloat16 14 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04 2 | 3 | ENV TZ=Etc/UTC 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update --fix-missing && apt-get install -y fontconfig --fix-missing 7 | RUN apt-get install -y libopenmpi-dev 8 | RUN apt-get install -y git python3.9 python3.9-dev python3.9-venv 9 | RUN python3.9 -m venv /venv 10 | ENV PATH=/venv/bin:$PATH 11 | RUN pip install mpi4py 12 | 13 | ARG SRCDIR 14 | 15 | RUN mkdir /LMFlow/ 16 | WORKDIR /LMFlow/ 17 | 18 | COPY $SRCDIR/ /LMFlow/ 19 | 20 | RUN pip install wheel 21 | RUN pip install -e . 22 | -------------------------------------------------------------------------------- /src/lmflow/pipeline/base_tuner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """ BaseTuner: a subclass of BasePipeline. 4 | """ 5 | 6 | from lmflow.pipeline.base_pipeline import BasePipeline 7 | 8 | 9 | class BaseTuner(BasePipeline): 10 | """ A subclass of BasePipeline which is tunable. 
11 | """ 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | def _check_if_tunable(self, model, dataset): 16 | # TODO: check if the model is tunable and dataset is compatible 17 | pass 18 | 19 | def tune(self, model, dataset): 20 | raise NotImplementedError(".tune is not implemented") 21 | -------------------------------------------------------------------------------- /src/lmflow/models/decoder_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """A one-line summary of the module or program, terminated by a period. 4 | 5 | Leave one blank line. The rest of this docstring should contain an 6 | overall description of the module or program. Optionally, it may also 7 | contain a brief description of exported classes and functions and/or usage 8 | examples. 9 | 10 | Typical usage example: 11 | 12 | foo = ClassFoo() 13 | bar = foo.FunctionBar() 14 | """ 15 | 16 | from lmflow.models.base_model import BaseModel 17 | 18 | 19 | class DecoderModel(BaseModel): 20 | 21 | def __init__(self, *args, **kwargs): 22 | pass 23 | -------------------------------------------------------------------------------- /src/lmflow/models/encoder_decoder_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """A one-line summary of the module or program, terminated by a period. 4 | 5 | Leave one blank line. The rest of this docstring should contain an 6 | overall description of the module or program. Optionally, it may also 7 | contain a brief description of exported classes and functions and/or usage 8 | examples. 9 | 10 | Typical usage example: 11 | 12 | foo = ClassFoo() 13 | bar = foo.FunctionBar() 14 | """ 15 | 16 | from lmflow.models.base_model import BaseModel 17 | 18 | 19 | class EncoderDecoderModel(BaseModel): 20 | 21 | def __init__(self, *args, **kwargs): 22 | pass -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.2 2 | datasets==2.10.1 3 | peft @ git+https://github.com/huggingface/peft.git@deff03f2c251534fffd2511fc2d440e84cc54b1b 4 | torch==2.0.0 5 | wandb==0.14.0 6 | deepspeed==0.8.3 7 | trl @ git+https://github.com/lvwerra/trl.git#egg=trl-0.4.1 8 | sentencepiece 9 | transformers @ git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda 10 | flask 11 | flask_cors 12 | icetk 13 | cpm_kernels==1.0.11 14 | evaluate==0.4.0 15 | scikit-learn==1.2.2 16 | lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@e47e01beea79cfe87421e2dac49e64d499c240b4 17 | dill<0.3.5 18 | bitsandbytes==0.38.1 19 | pydantic<=1.10.9 20 | gradio 21 | -------------------------------------------------------------------------------- /scripts/run_evaluation_with_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --model_name_or_path specifies the original huggingface model 4 | # --lora_model_path specifies the model difference introduced by finetuning, 5 | # i.e. 
the one saved by ./scripts/run_finetune_with_lora.sh 6 | CUDA_VISIBLE_DEVICES=0 \ 7 | deepspeed examples/evaluation.py \ 8 | --answer_type text \ 9 | --model_name_or_path facebook/galactica-1.3b \ 10 | --lora_model_path output_models/finetune_with_lora \ 11 | --dataset_path data/alpaca/test \ 12 | --prompt_structure "Input: {input}" \ 13 | --deepspeed examples/ds_config.json \ 14 | --inference_batch_size_per_device 1 \ 15 | --metric accuracy 16 | -------------------------------------------------------------------------------- /scripts/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" == "-h" -o "$1" == "--help" ]; then 4 | help_message="./$(basename $0)" 5 | help_message+=" --dataset_name DATASET_NAME" 6 | help_message+=" --model_name_or_path MODEL_NAME_OR_PATH" 7 | echo ${help_message} 1>&2 8 | exit 1 9 | fi 10 | 11 | extra_args="--dataset_name gpt4_en_eval --model_name_or_path gpt2" 12 | if [ $# -ge 1 ]; then 13 | extra_args="$@" 14 | fi 15 | 16 | 17 | CUDA_VISIBLE_DEVICES=0 \ 18 | deepspeed --master_port 11001 examples/benchmarking.py \ 19 | --use_ram_optimized_load 0 \ 20 | --deepspeed examples/ds_config.json \ 21 | --metric nll \ 22 | --prompt_structure "###Human: {input}###Assistant:" \ 23 | ${extra_args} -------------------------------------------------------------------------------- /src/lmflow/pipeline/base_aligner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """ BaseAligner: a subclass of BasePipeline. 4 | """ 5 | 6 | from lmflow.pipeline.base_pipeline import BasePipeline 7 | 8 | 9 | class BaseAligner(BasePipeline): 10 | """ A subclass of BasePipeline which is alignable. 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | def _check_if_alignable(self, model, dataset, reward_model): 16 | # TODO: check if the model is alignable and dataset is compatible 17 | # TODO: add reward_model 18 | pass 19 | 20 | def align(self, model, dataset, reward_model): 21 | raise NotImplementedError(".align is not implemented") 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for this project 4 | title: "[New Feature]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | docs: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v3 8 | - uses: actions/setup-python@v3 9 | - name: Install current pkg 10 | run: | 11 | pip install -e . 
12 | - name: Install dependencies 13 | run: | 14 | pip install -r ./docs/requirements.txt 15 | - name: Sphinx build 16 | run: | 17 | sphinx-build docs/source _build 18 | - name: Deploy 19 | uses: peaceiris/actions-gh-pages@v3 20 | with: 21 | publish_branch: gh-pages 22 | github_token: ${{ secrets.GITHUB_TOKEN }} 23 | publish_dir: _build/ 24 | force_orphan: true 25 | -------------------------------------------------------------------------------- /docs/source/examples/index.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | We provide several examples to show how to use our package in your problem. 4 | 5 | ## Data preparation 6 | 7 | ```{toctree} 8 | :maxdepth: 3 9 | 10 | DATASETS 11 | ``` 12 | 13 | ```{toctree} 14 | :maxdepth: 3 15 | 16 | checkpoints 17 | ``` 18 | 19 | ## Finetuning 20 | 21 | For SFT, Refer to [examples](https://github.com/OptimalScale/LMFlow/blob/main/examples). 22 | 23 | 24 | For alignment process, 25 | 26 | ```{toctree} 27 | :maxdepth: 3 28 | 29 | reward_modeling 30 | ``` 31 | 32 | 33 | ```{toctree} 34 | :maxdepth: 3 35 | 36 | raft 37 | ``` 38 | 39 | ## Inference 40 | 41 | Refer to [examples](https://github.com/OptimalScale/LMFlow/blob/main/examples). 42 | 43 | ## Evaluation 44 | 45 | ```{toctree} 46 | :maxdepth: 3 47 | 48 | TASK_GUIDE 49 | ``` 50 | 51 | 52 | -------------------------------------------------------------------------------- /scripts/run_vis_chatbot_minigpt4.sh: -------------------------------------------------------------------------------- 1 | model=Salesforce/blip2-flan-t5-xxl 2 | checkpoint_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth 3 | llm_model_name_or_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/vicuna-7b/ 4 | deepspeed examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_multimodal.json --arch_type vision_encoder_decoder --task vqa --custom_model \ 5 | --prompt_format mini_gpt \ 6 | --prompt_structure "{input_text}###Assistant:" \ 7 | --checkpoint_path ${checkpoint_path} \ 8 | --llm_model_name_or_path ${llm_model_name_or_path} \ 9 | --low_resource True \ 10 | ${@:1} 11 | 12 | -------------------------------------------------------------------------------- /configs/ds_config_zero3_for_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 3, 7 | "offload_optimizer": { 8 | "device": "cpu", 9 | "pin_memory": true 10 | }, 11 | "offload_param": { 12 | "device": "cpu", 13 | "pin_memory": true 14 | }, 15 | "overlap_comm": true, 16 | "contiguous_gradients": true, 17 | "sub_group_size": 1e9, 18 | "reduce_bucket_size": "auto", 19 | "stage3_prefetch_bucket_size": "auto", 20 | "stage3_param_persistence_threshold": "auto", 21 | "stage3_max_live_parameters": 1e9, 22 | "stage3_max_reuse_distance": 1e9, 23 | "stage3_gather_16bit_weights_on_model_save": true 24 | }, 25 | 26 | "steps_per_print": 2000, 27 | "train_micro_batch_size_per_gpu": 1, 28 | "wall_clock_breakdown": false 29 | } 30 | -------------------------------------------------------------------------------- /scripts/.nfs0000000094418362000004c4: -------------------------------------------------------------------------------- 1 | model=Salesforce/blip2-flan-t5-xxl 2 | checkpoint_path=$1 3 | llm_model_name_or_path=$2 4 | deepspeed examples/vis_chatbot_gradio.py --model_name_or_path ${model} \ 5 | --deepspeed 
configs/ds_config_multimodal.json \ 6 | --arch_type vision_encoder_decoder \ 7 | --task vqa \ 8 | --custom_model \ 9 | --prompt_format mini_gpt \ 10 | --prompt_structure "{input_text}###Assistant:" \ 11 | --checkpoint_path ${checkpoint_path} \ 12 | --llm_model_name_or_path ${llm_model_name_or_path} \ 13 | --low_resource True \ 14 | ${@:3} 15 | -------------------------------------------------------------------------------- /scripts/vocab_extension/train_merge_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download data 4 | cd data && bash download.sh wiki_zh_eval && cd - 5 | 6 | # convert json to txt for sentencepiece 7 | python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \ 8 | --output_path ./data/wiki_zh_eval/converted_data.txt \ 9 | --overwrite True 10 | 11 | # train a new tokenizer 12 | mkdir -p ./output_models/new_tokenizer 13 | python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \ 14 | --model_type bpe \ 15 | --output_dir ./output_models/new_tokenizer \ 16 | --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \ 17 | --vocab_size 20000 18 | 19 | # merge the new tokenizer with the old one 20 | mkdir -p ./output_models/merged_tokenizer 21 | python utils/merge_tokenizer.py --chinese_sp_model_file ./output_models/new_tokenizer/example.model \ 22 | --tokenizer_dir pinkmanlove/llama-7b-hf \ 23 | --output_dir ./output_models/merged_tokenizer -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /src/lmflow/models/auto_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """Automatically get correct model type. 
4 | """ 5 | 6 | from lmflow.models.hf_decoder_model import HFDecoderModel 7 | from lmflow.models.text_regression_model import TextRegressionModel 8 | from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel 9 | 10 | class AutoModel: 11 | 12 | @classmethod 13 | def get_model(self, model_args, *args, **kwargs): 14 | arch_type = model_args.arch_type 15 | if arch_type == "decoder_only": 16 | return HFDecoderModel(model_args, *args, **kwargs) 17 | elif arch_type == "text_regression": 18 | return TextRegressionModel(model_args, *args, **kwargs) 19 | elif arch_type == "encoder_decoder" or \ 20 | arch_type == "vision_encoder_decoder": 21 | return HFEncoderDecoderModel(model_args, *args, **kwargs) 22 | else: 23 | raise NotImplementedError( 24 | f"model architecture type \"{arch_type}\" is not supported" 25 | ) 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # LMFlow 2 | 3 | We welcome contributions from the open-source community with open arms! We value and appreciate all types of participation, not just code. Whether you're answering questions, offering help, improving the documentation, or simply reaching out, your contributions are immensely valuable to us. So, if you're interested, don't hesitate to get involved! 4 | 5 | To start, we encourage everyone to say hello in our public Discord channel. Here, we discuss the latest trends in Large Foundation models, showcase personal projects, help each other with contributions, or just hang out over a cup of coffee. Join us on Discord! 6 | 7 | No matter how you choose to contribute, we strive to maintain an open, welcoming, and kind community. We ask that you read our code of conduct and be respectful during your interactions. It's also essential that you become familiar with the ethical guidelines that guide our project and adhere to the same principles of transparency and responsibility. 8 | 9 | We highly value feedback from the community, so please don't hesitate to speak up if you have any valuable feedback that can help improve the library. We read and consider every message, comment, issue, and pull request (PR). 
10 | -------------------------------------------------------------------------------- /configs/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "zero_optimization": { 26 | "stage": 2, 27 | "offload_optimizer": { 28 | "device": "cpu", 29 | "pin_memory": true 30 | }, 31 | "allgather_partitions": true, 32 | "allgather_bucket_size": 2e8, 33 | "overlap_comm": true, 34 | "reduce_scatter": true, 35 | "reduce_bucket_size": 2e8, 36 | "contiguous_gradients": true 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 2000, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } 46 | -------------------------------------------------------------------------------- /scripts/run_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under ${project_id} in project directory of 3 | # https://github.com/shizhediao/llm-ft 4 | # COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 5 | 6 | deepspeed_args="--master_port=11000" # Default argument 7 | if [ $# -ge 1 ]; then 8 | deepspeed_args="$1" 9 | fi 10 | 11 | exp_id=finetune 12 | project_dir=$(cd "$(dirname $0)"/..; pwd) 13 | output_dir=${project_dir}/output_models/${exp_id} 14 | log_dir=${project_dir}/log/${exp_id} 15 | 16 | dataset_path=${project_dir}/data/alpaca/train 17 | 18 | mkdir -p ${output_dir} ${log_dir} 19 | 20 | deepspeed ${deepspeed_args} \ 21 | examples/finetune.py \ 22 | --model_name_or_path gpt2 \ 23 | --dataset_path ${dataset_path} \ 24 | --output_dir ${output_dir} --overwrite_output_dir \ 25 | --num_train_epochs 0.01 \ 26 | --learning_rate 2e-5 \ 27 | --block_size 512 \ 28 | --per_device_train_batch_size 1 \ 29 | --deepspeed configs/ds_config_zero3.json \ 30 | --bf16 \ 31 | --run_name finetune \ 32 | --validation_split_percentage 0 \ 33 | --logging_steps 20 \ 34 | --do_train \ 35 | --ddp_timeout 72000 \ 36 | --save_steps 5000 \ 37 | --dataloader_num_workers 1 \ 38 | | tee ${log_dir}/train.log \ 39 | 2> ${log_dir}/train.err 40 | -------------------------------------------------------------------------------- /src/lmflow/pipeline/auto_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """Return a pipeline automatically based on its name. 4 | """ 5 | 6 | from lmflow.pipeline.evaluator import Evaluator 7 | from lmflow.pipeline.finetuner import Finetuner 8 | from lmflow.pipeline.inferencer import Inferencer 9 | from lmflow.pipeline.raft_aligner import RaftAligner 10 | 11 | 12 | PIPELINE_MAPPING = { 13 | "evaluator": Evaluator, 14 | "finetuner": Finetuner, 15 | "inferencer": Inferencer, 16 | "raft_aligner": RaftAligner, 17 | } 18 | 19 | 20 | class AutoPipeline: 21 | """ 22 | The class designed to return a pipeline automatically based on its name. 
23 | """ 24 | @classmethod 25 | def get_pipeline(self, 26 | pipeline_name, 27 | model_args, 28 | data_args, 29 | pipeline_args, 30 | *args, 31 | **kwargs 32 | ): 33 | if pipeline_name not in PIPELINE_MAPPING: 34 | raise NotImplementedError( 35 | f'Pipeline "{pipeline_name}" is not supported' 36 | ) 37 | 38 | pipeline = PIPELINE_MAPPING[pipeline_name]( 39 | model_args, 40 | data_args, 41 | pipeline_args, 42 | *args, 43 | **kwargs 44 | ) 45 | return pipeline 46 | -------------------------------------------------------------------------------- /scripts/run_finetune_with_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under ${project_id} in project directory of 3 | 4 | deepspeed_args="--master_port=11000" # Default argument 5 | if [ $# -ge 1 ]; then 6 | deepspeed_args="$1" 7 | fi 8 | 9 | exp_id=finetune_with_lora 10 | project_dir=$(cd "$(dirname $0)"/..; pwd) 11 | output_dir=${project_dir}/output_models/${exp_id} 12 | log_dir=${project_dir}/log/${exp_id} 13 | 14 | dataset_path=${project_dir}/data/alpaca/train 15 | 16 | mkdir -p ${output_dir} ${log_dir} 17 | 18 | deepspeed ${deepspeed_args} \ 19 | examples/finetune.py \ 20 | --model_name_or_path facebook/galactica-1.3b \ 21 | --dataset_path ${dataset_path} \ 22 | --output_dir ${output_dir} --overwrite_output_dir \ 23 | --num_train_epochs 0.01 \ 24 | --learning_rate 1e-4 \ 25 | --block_size 512 \ 26 | --per_device_train_batch_size 1 \ 27 | --use_lora 1 \ 28 | --lora_r 8 \ 29 | --save_aggregated_lora 0\ 30 | --deepspeed configs/ds_config_zero2.json \ 31 | --bf16 \ 32 | --run_name finetune_with_lora \ 33 | --validation_split_percentage 0 \ 34 | --logging_steps 20 \ 35 | --do_train \ 36 | --ddp_timeout 72000 \ 37 | --save_steps 5000 \ 38 | --dataloader_num_workers 1 \ 39 | | tee ${log_dir}/train.log \ 40 | 2> ${log_dir}/train.err 41 | -------------------------------------------------------------------------------- /utils/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import argparse 5 | import os 6 | import sentencepiece as spm 7 | 8 | if __name__ == '__main__': 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--dataset_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False) 12 | parser.add_argument('--output_dir', default='./output_models/new_tokenizer', type=str, required=False) 13 | parser.add_argument('--vocab_size', default=20000, type=int, required=False) 14 | parser.add_argument('--model_type', default='bpe', type=str, required=False) 15 | parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False) 16 | args = parser.parse_args() 17 | 18 | dataset_path = args.dataset_path 19 | output_dir = args.output_dir 20 | vocab_size = args.vocab_size 21 | model_type = args.model_type 22 | user_defined_symbols = args.user_defined_symbols 23 | 24 | def mkdir(path): 25 | if not os.path.exists(path): 26 | os.makedirs(path) 27 | mkdir(output_dir) 28 | 29 | spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols)) -------------------------------------------------------------------------------- /utils/convert_minigpt4_checkpoints.py: -------------------------------------------------------------------------------- 1 | 
import argparse 2 | import os.path as osp 3 | import torch 4 | 5 | def parse_args(): 6 | parser = argparse.ArgumentParser(description="Convert checkpoint from MiniGPT4") 7 | parser.add_argument("--model_path", type=str, help="the model path for the to convert checkpoint") 8 | parser.add_argument("--save_path", default=None, type=str, help="the save path for converted checkpoint") 9 | args = parser.parse_args() 10 | return args 11 | 12 | 13 | 14 | 15 | 16 | if __name__ == "__main__": 17 | args = parse_args() 18 | model = torch.load(args.model_path) 19 | model = model['model'] 20 | new_model = {} 21 | for key, item in model.items(): 22 | key = key.replace("Qformer", "qformer") 23 | key = key.replace("llama_proj", "language_projection") 24 | key = key.replace("llama_model.model", "language_model.model") 25 | new_model[key] = item 26 | if args.save_path is None: 27 | end_string = osp.splitext(args.model_path) 28 | save_path = osp.dirname(args.model_path) + "/" + \ 29 | osp.basename(args.model_path).replace(".pth", "") + \ 30 | "-converted" + osp.splitext(args.model_path)[-1] 31 | else: 32 | save_path = args.save_path 33 | print("save_path: {}".format(save_path)) 34 | 35 | torch.save(new_model, save_path) 36 | -------------------------------------------------------------------------------- /scripts/run_multistage_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under ${project_id} in project directory of 3 | 4 | deepspeed_args="--master_port=11000" # Default argument 5 | if [ $# -ge 1 ]; then 6 | deepspeed_args="$1" 7 | fi 8 | 9 | exp_id=multistage_finetune 10 | project_dir=$(cd "$(dirname $0)"/..; pwd) 11 | output_dir=${project_dir}/output_models/${exp_id} 12 | log_dir=${project_dir}/log/${exp_id} 13 | dataset_path="${project_dir}/data/example_dataset/train" 14 | 15 | mkdir -p ${output_dir} ${log_dir} 16 | 17 | deepspeed ${deepspeed_args} \ 18 | examples/multistage_finetune.py \ 19 | --num_stages_per_epoch 1 \ 20 | --run_name ${exp_id} \ 21 | --model_name_or_path facebook/galactica-1.3b \ 22 | --dataset_path ${dataset_path} \ 23 | --output_dir ${output_dir} --overwrite_output_dir \ 24 | --num_train_epochs 3 \ 25 | --learning_rate 1e-3 \ 26 | --block_size 512 \ 27 | --per_device_train_batch_size 2 \ 28 | --use_lora 1 \ 29 | --lora_r 8 \ 30 | --save_aggregated_lora 1 \ 31 | --deepspeed configs/ds_config_zero2.json \ 32 | --bf16 \ 33 | --run_name finetune_with_lora \ 34 | --validation_split_percentage 0 \ 35 | --logging_steps 20 \ 36 | --do_train \ 37 | --ddp_timeout 72000 \ 38 | --save_steps 5000 \ 39 | --dataloader_num_workers 1 \ 40 | | tee ${log_dir}/train.log \ 41 | 2> ${log_dir}/train.err 42 | -------------------------------------------------------------------------------- /examples/merge_lora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Merge base model and lora model into a full model. 
6 | """ 7 | 8 | import sys 9 | import os 10 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) 11 | 12 | from dataclasses import dataclass, field 13 | from transformers import HfArgumentParser 14 | from typing import Optional 15 | 16 | from lmflow.args import ( 17 | ModelArguments, 18 | AutoArguments, 19 | ) 20 | 21 | from lmflow.models.auto_model import AutoModel 22 | 23 | 24 | @dataclass 25 | class MergeLoraArguments: 26 | output_model_path: Optional[str] = field( 27 | default=None, 28 | metadata={ 29 | "help": "output merged full model path" 30 | }, 31 | ) 32 | 33 | 34 | def main(): 35 | parser = HfArgumentParser((ModelArguments, MergeLoraArguments)) 36 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 37 | model_args, merge_lora_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 38 | else: 39 | model_args, merge_lora_args = parser.parse_args_into_dataclasses() 40 | 41 | model_args.use_lora = True 42 | model = AutoModel.get_model(model_args) 43 | model.merge_lora_weights() 44 | model.save(merge_lora_args.output_model_path, save_full_model=True) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /scripts/run_reward_modeling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under ${project_id} in project directory of 3 | # https://github.com/shizhediao/llm-ft 4 | # COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 5 | 6 | deepspeed_args="--master_port=11000" # Default argument 7 | if [ $# -ge 1 ]; then 8 | deepspeed_args="$1" 9 | fi 10 | 11 | exp_id=rm 12 | project_dir=$(cd "$(dirname $0)"/..; pwd) 13 | output_dir=${project_dir}/output_models/${exp_id} 14 | log_dir=${project_dir}/log/${exp_id} 15 | 16 | dataset_path=${project_dir}/data/hh_rlhf/rm/hh_rlhf_rm_training.json 17 | 18 | mkdir -p ${output_dir} ${log_dir} 19 | 20 | deepspeed ${deepspeed_args} \ 21 | examples/reward_modeling.py \ 22 | --model_name_or_path gpt2 \ 23 | --dataset_path ${dataset_path} \ 24 | --output_dir ${output_dir} --overwrite_output_dir \ 25 | --num_train_epochs 1 \ 26 | --learning_rate 3e-5 \ 27 | --block_size 512 \ 28 | --per_device_train_batch_size 1 \ 29 | --per_device_eval_batch_size 1\ 30 | --deepspeed configs/ds_config_zero2.json \ 31 | --bf16 \ 32 | --run_name rm_test \ 33 | --validation_split_percentage 10 \ 34 | --logging_steps 10 \ 35 | --do_train \ 36 | --ddp_timeout 72000 \ 37 | --save_steps 999999 \ 38 | --evaluation_strategy steps\ 39 | --eval_steps 100\ 40 | --weight_decay 0.001\ 41 | --dataloader_num_workers 1 \ 42 | | tee ${log_dir}/train.log \ 43 | 2> ${log_dir}/train.err 44 | -------------------------------------------------------------------------------- /utils/convert_json_to_txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import argparse 5 | import logging 6 | 7 | import json 8 | from pathlib import Path 9 | 10 | logging.basicConfig(level=logging.WARNING) 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--dataset_path', default='./data/wiki_zh_eval', type=str, required=False) 16 | parser.add_argument('--output_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False) 17 | parser.add_argument('--overwrite', default=False, type=bool, required=False) 18 | args = parser.parse_args() 19 | 20 | dataset_path = 
args.dataset_path 21 | outputfile = args.output_path 22 | 23 | outputs_list = [] 24 | data_files = [ 25 | x.absolute().as_posix() 26 | for x in Path(dataset_path).glob("*.json") 27 | ] 28 | 29 | for file_name in data_files: 30 | with open(file_name) as fin: 31 | json_data = json.load(fin) 32 | type = json_data["type"] 33 | for line in json_data["instances"]: 34 | outputs_list.append(line["text"]) 35 | 36 | 37 | if Path(outputfile).exists() and not args.overwrite: 38 | logging.warning("File %s exists, will not overwrite.", outputfile) 39 | else: 40 | with open(outputfile, "w") as f: 41 | for line in outputs_list: 42 | f.write(line) 43 | 44 | -------------------------------------------------------------------------------- /docs/source/examples/checkpoints.md: -------------------------------------------------------------------------------- 1 | # Checkpoints 2 | 3 | In general, you can directly load from checkpoints by using `--model_name_or_path`. However, the LLaMA case is slightly different due to license restrictions. 4 | 5 | 6 | ## LLaMA Checkpoint 7 | 8 | 1. First, you need to request access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama). Download the official checkpoints and save them into `${llama-path}`. 9 | 10 | 2. Second, convert the official checkpoints in `${llama-path}` to HuggingFace-format checkpoints in `${llama-hf-path}` by running 11 | 12 | `python ./scripts/convert_llama_weights_to_hf.py --input_dir ${llama-path} --model_size 7B --output_dir ${llama-hf-path}/llama-7b-hf` 13 | 14 | 3. Then you are good to go by setting the checkpoint path to `${llama-hf-path}/llama-7b-hf`. Enjoy it! 15 | 16 | 4. (optional) Now you have the original llama-7b-hf pretrained model. With 17 | ```sh 18 | cd output_models && ./download.sh all && cd - 19 | ``` 20 | you can obtain the model difference finetuned by us. Then, in a way similar to `./scripts/run_evaluation_with_lora.sh`, run 21 | ```sh 22 | CUDA_VISIBLE_DEVICES=0 \ 23 | deepspeed examples/evaluate.py \ 24 | --answer_type text \ 25 | --model_name_or_path ${llama-hf-path}/llama-7b-hf \ 26 | --lora_model_path output_models/${llama-model-diff-path} \ 27 | --dataset_path data/alpaca/test \ 28 | --prompt_structure "Input: {input}" \ 29 | --deepspeed examples/ds_config.json 30 | ``` 31 | You can now evaluate with the finetuned LLaMA model.
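32 | 
33 | 5. (optional) If you prefer a single standalone checkpoint rather than loading the model difference at evaluation time, you can merge it into the base model in the same way as `examples/merge_lora.py`. The snippet below is a minimal sketch, not an official recipe: the output path is a placeholder, and it assumes `lora_model_path` is a `ModelArguments` field, as the evaluation command above suggests.
34 | ```python
35 | from lmflow.args import ModelArguments
36 | from lmflow.models.auto_model import AutoModel
37 | 
38 | # Base LLaMA checkpoint plus the downloaded model difference (placeholder paths).
39 | model_args = ModelArguments(
40 |     model_name_or_path="${llama-hf-path}/llama-7b-hf",
41 |     lora_model_path="output_models/${llama-model-diff-path}",  # assumed field name
42 | )
43 | model_args.use_lora = True
44 | 
45 | # Load base model + diff, merge the LoRA weights, and save a full model.
46 | model = AutoModel.get_model(model_args)
47 | model.merge_lora_weights()
48 | model.save("output_models/llama-7b-merged", save_full_model=True)
49 | ```
50 | The merged directory can then be loaded directly with `--model_name_or_path`, without `--lora_model_path`.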
-------------------------------------------------------------------------------- /scripts/run_finetune_with_lora_save_aggregated_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under ${project_id} in project directory of 3 | 4 | deepspeed_args="--master_port=11000" # Default argument 5 | if [ $# -ge 1 ]; then 6 | deepspeed_args="$1" 7 | fi 8 | 9 | exp_id=finetune_with_lora 10 | project_dir=$(cd "$(dirname $0)"/..; pwd) 11 | output_dir=${project_dir}/output_models/${exp_id} 12 | log_dir=${project_dir}/log/${exp_id} 13 | 14 | dataset_path=${project_dir}/data/alpaca/train 15 | eval_dataset_path=${project_dir}/data/alpaca/test 16 | 17 | mkdir -p ${output_dir} ${log_dir} 18 | 19 | deepspeed ${deepspeed_args} \ 20 | examples/finetune.py \ 21 | --model_name_or_path facebook/galactica-1.3b \ 22 | --dataset_path ${dataset_path} \ 23 | --output_dir ${output_dir} --overwrite_output_dir \ 24 | --num_train_epochs 0.01 \ 25 | --learning_rate 1e-4 \ 26 | --block_size 512 \ 27 | --per_device_train_batch_size 1 \ 28 | --use_lora 1 \ 29 | --lora_r 8 \ 30 | --save_aggregated_lora 1\ 31 | --deepspeed configs/ds_config_zero2.json \ 32 | --bf16 \ 33 | --run_name finetune_with_lora \ 34 | --validation_split_percentage 0 \ 35 | --logging_steps 20 \ 36 | --do_train \ 37 | --do_eval \ 38 | --evaluation_strategy "steps" \ 39 | --eval_steps 1000 \ 40 | --eval_dataset_path ${eval_dataset_path} \ 41 | --ddp_timeout 72000 \ 42 | --save_steps 5000 \ 43 | --dataloader_num_workers 1 \ 44 | | tee ${log_dir}/train.log \ 45 | 2> ${log_dir}/train.err 46 | -------------------------------------------------------------------------------- /configs/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 | "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "zero_optimization": { 26 | "stage": 3, 27 | "offload_optimizer": { 28 | "device": "cpu", 29 | "pin_memory": true 30 | }, 31 | "offload_param": { 32 | "device": "cpu", 33 | "pin_memory": true 34 | }, 35 | "overlap_comm": true, 36 | "contiguous_gradients": true, 37 | "sub_group_size": 1e9, 38 | "reduce_bucket_size": "auto", 39 | "stage3_prefetch_bucket_size": "auto", 40 | "stage3_param_persistence_threshold": "auto", 41 | "stage3_max_live_parameters": 1e9, 42 | "stage3_max_reuse_distance": 1e9, 43 | "stage3_gather_16bit_weights_on_model_save": true 44 | }, 45 | 46 | "gradient_accumulation_steps": "auto", 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 2000, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } 53 | -------------------------------------------------------------------------------- /scripts/run_raft_align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Please run this script under project directory. 
3 | 4 | deepspeed_args="--master_port=11110" # Default argument 5 | if [ $# -ge 1 ]; then 6 | deepspeed_args="$1" 7 | fi 8 | 9 | exp_id=raft_align 10 | project_dir=$(cd "$(dirname $0)"/..; pwd) 11 | output_dir=${project_dir}/output_models/${exp_id} 12 | log_dir=${project_dir}/log/${exp_id} 13 | 14 | mkdir -p ${output_dir} ${log_dir} 15 | 16 | export PYTHONPATH=. 17 | deepspeed ${deepspeed_args} \ 18 | examples/raft_align.py \ 19 | --model_name_or_path gpt2 \ 20 | --num_raft_iteration 20 \ 21 | --learning_rate 2e-5 \ 22 | --lr_scheduler_type "constant" \ 23 | --bf16 \ 24 | --deepspeed configs/ds_config_zero2.json \ 25 | --dataset_path ${project_dir}/data/hh_rlhf/rlhf_prompt \ 26 | --output_reward_path ${project_dir}/tmp/raft_aligner/reward.txt \ 27 | --output_dir ${output_dir} --overwrite_output_dir \ 28 | --run_name ${exp_id} \ 29 | --num_train_epochs 4 \ 30 | --per_device_train_batch_size 1 \ 31 | --per_device_eval_batch_size 1 \ 32 | --validation_split_percentage 0 \ 33 | --logging_steps 1 \ 34 | --do_train \ 35 | --ddp_timeout 72000 \ 36 | --save_steps 7777 \ 37 | --dataloader_num_workers 1 \ 38 | --preprocessing_num_workers 12 \ 39 | --inference_batch_size_per_device 1 \ 40 | --collection_strategy "local" \ 41 | --raft_batch_size 1024 \ 42 | --output_min_length 96 \ 43 | --top_reward_percentage 0.125 \ 44 | | tee ${log_dir}/raft_align.log \ 45 | 2> ${log_dir}/raft_align.err 46 | -------------------------------------------------------------------------------- /src/lmflow/models/text_regression_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """ 4 | A model maps "text_only" data to float. 5 | """ 6 | 7 | from lmflow.models.regression_model import RegressionModel 8 | from lmflow.datasets.dataset import Dataset 9 | 10 | 11 | class TextRegressionModel(RegressionModel): 12 | r""" 13 | Initializes a TextRegressionModel instance. 14 | 15 | Parameters 16 | ------------ 17 | 18 | model_args : 19 | Model arguments such as model name, path, revision, etc. 20 | 21 | args : Optional. 22 | Positional arguments. 23 | 24 | kwargs : Optional. 25 | Keyword arguments. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | model_args, 31 | *args, 32 | **kwargs 33 | ): 34 | """ 35 | Initializes a TextRegressionModel instance. 36 | :param model_args: dictionary with model arguments such as model name, path, revision, etc. 37 | """ 38 | self.inference_func = None 39 | 40 | 41 | def register_inference_function(self, inference_func): 42 | """ 43 | Registers a regression function. 44 | """ 45 | self.inference_func = inference_func 46 | 47 | 48 | def inference(self, inputs: Dataset): 49 | """ 50 | Gets regression results of a given dataset. 51 | 52 | :inputs: Dataset object, only accept type "text_only". 
53 | """ 54 | if self.inference_func is not None: 55 | return self.inference_func(inputs) 56 | else: 57 | pass 58 | -------------------------------------------------------------------------------- /scripts/data_preprocess/run_data_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run this shell script under project directory 3 | 4 | # For sample.py 5 | python scripts/data_preprocess/sample.py \ 6 | --dataset_path ./data/example_dataset/train/train_50.json \ 7 | --output_path ./data/example_dataset/train/train_50_sample.json \ 8 | --ratio 0.5 9 | 10 | # For shuffle.py 11 | python scripts/data_preprocess/shuffle.py \ 12 | --dataset_path ./data/example_dataset/train/train_50_sample.json \ 13 | --output_path ./data/example_dataset/train/train_50_sample_shuffle.json 14 | 15 | # For merge.py : you can specify multiple files to merge 16 | python scripts/data_preprocess/merge.py \ 17 | --dataset_path ./data/example_dataset/train/train_50.json \ 18 | --merge_from_path ./data/example_dataset/train/train_50_sample_shuffle.json \ 19 | ./data/example_dataset/train/train_50_sample.json \ 20 | --output_path ./data/example_dataset/train/train_merge.json \ 21 | 22 | # For concat.py: if you simply want to merge multiple files or a directory, use following. 23 | # You can also specify multiple files after --merge_from_path 24 | python scripts/data_preprocess/concat.py \ 25 | --merge_from_path ./data/example_dataset/train/*.json \ 26 | --output_path ./data/example_dataset/train/train_merge.json \ 27 | 28 | # For concat_shuffle_split.py: if you simply want to merge multiple files or a directory, use following. 29 | python scripts/data_preprocess/concat_shuffle_split.py \ 30 | --merge_from_path ./data/example_dataset/train/*.json \ 31 | --output_path ./data/processed_dataset/ \ -------------------------------------------------------------------------------- /scripts/run_all_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | help_message="./$(basename $0)" 4 | help_message+=" --model_name_or_path MODEL_NAME_OR_PATH" 5 | 6 | if [ $# -ge 1 ]; then 7 | extra_args="$@" 8 | fi 9 | 10 | model_name_or_path="" 11 | while [[ $# -ge 1 ]]; do 12 | key="$1" 13 | case ${key} in 14 | -h|--help) 15 | printf "${help_message}" 1>&2 16 | return 0 17 | ;; 18 | --model_name_or_path) 19 | model_name_or_path="$2" 20 | shift 21 | ;; 22 | *) 23 | # Ignores unknown options 24 | esac 25 | shift 26 | done 27 | 28 | model_name=$(echo "${model_name_or_path}" | sed "s/\//--/g") 29 | echo ${model_name} 30 | 31 | if [[ "${model_name}" = "" ]]; then 32 | echo "no model name specified" 1>&2 33 | exit 1 34 | fi 35 | 36 | log_dir=output_dir/${model_name}_lmflow_chat_nll_eval 37 | mkdir -p ${log_dir} 38 | echo "[Evaluating] Evaluate on LMFlow_chat" 39 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err 40 | 41 | log_dir=output_dir/${model_name}_all_nll_eval 42 | mkdir -p ${log_dir} 43 | echo "[Evaluating] Evaluate on [commonsense, wiki, instruction_following (gpt4) ] nll evaluation" 44 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err 45 | 46 | log_dir=output_dir/${model_name}_commonsense_qa_eval 47 | mkdir -p ${log_dir} 48 | echo "[Evaluating] Evaluate on commonsense QA Accuracy evaluation" 49 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name 
commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err -------------------------------------------------------------------------------- /scripts/data_preprocess/count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Counts number of instances in a dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import random 12 | import sys 13 | import textwrap 14 | 15 | def parse_argument(sys_argv): 16 | """Parses arguments from command line. 17 | Args: 18 | sys_argv: the list of arguments (strings) from command line. 19 | Returns: 20 | A struct whose member corresponds to the required (optional) variable. 21 | For example, 22 | ``` 23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 24 | args.input # 'a.txt' 25 | args.num # 10 26 | ``` 27 | """ 28 | parser = argparse.ArgumentParser( 29 | formatter_class=argparse.RawTextHelpFormatter) 30 | 31 | # Training parameters 32 | parser.add_argument( 33 | "--dataset_path", type=str, 34 | default=None, 35 | help="input dataset path, reads from stdin by default" 36 | ) 37 | 38 | # Parses from commandline 39 | args = parser.parse_args(sys_argv[1:]) 40 | 41 | return args 42 | 43 | 44 | def main(): 45 | args = parse_argument(sys.argv) 46 | if args.dataset_path is not None: 47 | with open(args.dataset_path, "r") as fin: 48 | data_dict = json.load(fin) 49 | else: 50 | data_dict = json.load(sys.stdin) 51 | 52 | num_instances = len(data_dict["instances"]) 53 | print(num_instances) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import find_packages 3 | from setuptools import setup 4 | import subprocess 5 | 6 | folder = os.path.dirname(__file__) 7 | version_path = os.path.join(folder, "src", "lmflow", "version.py") 8 | 9 | __version__ = None 10 | with open(version_path) as f: 11 | exec(f.read(), globals()) 12 | 13 | req_path = os.path.join(folder, "requirements.txt") 14 | install_requires = [] 15 | if os.path.exists(req_path): 16 | with open(req_path) as fp: 17 | install_requires = [line.strip() for line in fp] 18 | 19 | readme_path = os.path.join(folder, "README.md") 20 | readme_contents = "" 21 | if os.path.exists(readme_path): 22 | with open(readme_path, encoding='utf-8') as fp: 23 | readme_contents = fp.read().strip() 24 | 25 | setup( 26 | name="lmflow", 27 | version=__version__, 28 | description="LMFlow: Large Model Flow.", 29 | author="The LMFlow Team", 30 | long_description=readme_contents, 31 | long_description_content_type="text/markdown", 32 | package_dir={"": "src"}, 33 | packages=find_packages("src"), 34 | package_data={}, 35 | install_requires=install_requires, 36 | classifiers=[ 37 | "Intended Audience :: Science/Research/Engineering", 38 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 39 | "Programming Language :: Python :: 3.9", 40 | "Programming Language :: Python :: 3.10", 41 | ], 42 | requires_python=">=3.9", 43 | ) 44 | 45 | # Must be called after all dependency installed, since flash-attn setup.py 46 | # relies on torch, packaging, etc. 
47 | try: 48 | gpu_state = subprocess.check_output(["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]) 49 | if b"A100" in gpu_state: 50 | subprocess.call(["pip", "install", "flash-attn==1.0.4"]) 51 | except: 52 | pass 53 | -------------------------------------------------------------------------------- /tests/models/test_auto_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lmflow.args import ModelArguments 4 | from lmflow.models.auto_model import AutoModel 5 | from lmflow.models.hf_decoder_model import HFDecoderModel 6 | from lmflow.models.text_regression_model import TextRegressionModel 7 | from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel 8 | 9 | MODEL_NAME = "gpt2" 10 | 11 | 12 | class AutoModelTest(unittest.TestCase): 13 | 14 | def test_get_decoder_model(self): 15 | model_args = ModelArguments( 16 | arch_type="decoder_only", model_name_or_path=MODEL_NAME) 17 | model = AutoModel.get_model(model_args) 18 | self.assertTrue(isinstance(model, HFDecoderModel)) 19 | 20 | 21 | # This unit test is commented out since the encoder decoder model has not been fully implemented 22 | ''' 23 | def test_get_text_regression_model(self): 24 | model_args = ModelArguments( 25 | arch_type="text_regression", model_name_or_path=MODEL_NAME) 26 | model = AutoModel.get_model(model_args) 27 | self.assertTrue(isinstance(model, TextRegressionModel)) 28 | ''' 29 | 30 | 31 | # This unit test is commented out since the encoder decoder model has not been fully implemented 32 | ''' 33 | def test_get_encoder_decoder(self): 34 | model_args = ModelArguments( 35 | arch_type="encoder_decoder", model_name_or_path=MODEL_NAME) 36 | model = AutoModel.get_model(model_args) 37 | self.assertTrue(isinstance(model, HFEncoderDecoderModel)) 38 | ''' 39 | 40 | 41 | def test_get_unsupported_model(self): 42 | model_args = ModelArguments( 43 | arch_type="unsupported model", model_name_or_path=MODEL_NAME) 44 | with self.assertRaises(NotImplementedError): 45 | model = AutoModel.get_model(model_args) 46 | -------------------------------------------------------------------------------- /examples/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """A one-line summary of the module or program, terminated by a period. 5 | 6 | Leave one blank line. The rest of this docstring should contain an 7 | overall description of the module or program. Optionally, it may also 8 | contain a brief description of exported classes and functions and/or usage 9 | examples. 
10 | 11 | Typical usage example: 12 | 13 | foo = ClassFoo() 14 | bar = foo.FunctionBar() 15 | """ 16 | import json 17 | import os 18 | import sys 19 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) 20 | from transformers import HfArgumentParser 21 | 22 | from lmflow.datasets.dataset import Dataset 23 | from lmflow.pipeline.auto_pipeline import AutoPipeline 24 | from lmflow.models.auto_model import AutoModel 25 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments 26 | 27 | 28 | pipeline_name = "evaluator" 29 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) 30 | 31 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) 32 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() 33 | 34 | with open (pipeline_args.deepspeed, "r") as f: 35 | ds_config = json.load(f) 36 | 37 | model = AutoModel.get_model( 38 | model_args, 39 | tune_strategy='none', 40 | ds_config=ds_config, 41 | use_accelerator=pipeline_args.use_accelerator_for_evaluator 42 | ) 43 | dataset = Dataset(data_args) 44 | 45 | evaluator = AutoPipeline.get_pipeline( 46 | pipeline_name=pipeline_name, 47 | model_args=model_args, 48 | data_args=data_args, 49 | pipeline_args=pipeline_args, 50 | ) 51 | evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric) 52 | -------------------------------------------------------------------------------- /docs/source/examples/medical_finetune.md: -------------------------------------------------------------------------------- 1 | # Finetune 2 | 3 | ```python 4 | import sys 5 | 6 | from transformers import HfArgumentParser 7 | 8 | from lmflow.args import ( 9 | ModelArguments, 10 | DatasetArguments, 11 | AutoArguments, 12 | ) 13 | 14 | from lmflow.datasets.dataset import Dataset 15 | from lmflow.models.tunable_models import TunableModel 16 | from lmflow.pipeline.auto_pipeline import AutoPipeline 17 | 18 | 19 | def main(): 20 | # Parses arguments 21 | pipeline_name = "finetuner" 22 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) 23 | 24 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) 25 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 26 | # If we pass only one argument to the script and it's the path to a json file, 27 | # let's parse it to get our arguments. 
28 | model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 29 | else: 30 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() 31 | 32 | # TODO: deepspeed config initialization 33 | 34 | # Initialization 35 | finetuner = AutoPipeline.get_pipeline( 36 | pipeline_name=pipeline_name, 37 | model_args=model_args, 38 | data_args=data_args, 39 | pipeline_args=pipeline_args, 40 | ) 41 | dataset = Dataset(data_args) 42 | model = TunableModel(model_args) 43 | 44 | # Tokenization and text grouping must be done in the main process 45 | with pipeline_args.main_process_first(desc="dataset map tokenization"): 46 | tokenized_dataset = model.tokenize(dataset) 47 | lm_dataset = finetuner.group_text( 48 | tokenized_dataset, 49 | model_max_length=model.get_max_length(), 50 | ) 51 | 52 | # Finetuning 53 | tuned_model = finetuner.tune(model=model, lm_dataset=lm_dataset) 54 | 55 | ``` 56 | -------------------------------------------------------------------------------- /utils/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make the delta weights by subtracting base weights. 3 | 4 | Usage: 5 | python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 6 | """ 7 | import argparse 8 | 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer, AutoModelForCausalLM 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path): 15 | print(f"Loading the base model from {base_model_path}") 16 | base = AutoModelForCausalLM.from_pretrained( 17 | base_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True 18 | ) 19 | 20 | print(f"Loading the target model from {target_model_path}") 21 | target = AutoModelForCausalLM.from_pretrained( 22 | target_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True 23 | ) 24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) 25 | 26 | print("Calculating the delta") 27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 28 | assert name in base.state_dict() 29 | param.data -= base.state_dict()[name] 30 | 31 | print(f"Saving the delta to {delta_path}") 32 | if args.hub_repo_id: 33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} 34 | else: 35 | kwargs = {} 36 | target.save_pretrained(delta_path, **kwargs) 37 | target_tokenizer.save_pretrained(delta_path, **kwargs) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | parser.add_argument("--hub-repo-id", type=str) 46 | args = parser.parse_args() 47 | 48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /scripts/run_vis_chatbot_gradio_minigpt4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model=Salesforce/blip2-flan-t5-xxl 4 | 5 | # if [ ! -f output_models/pretrained_minigpt4_7b.pth ]; then 6 | # cd output_models && ./download.sh minigpt4_7b && cd - 7 | # fi 8 | # 9 | # if [ ! 
-f output_models/pretrained_minigpt4_7b_converted.pth ]; then 10 | # python utils/convert_minigpt4_checkpoints.py \ 11 | # --model_path output_models/pretrained_minigpt4_7b.pth \ 12 | # --save_path output_models/pretrained_minigpt4_7b_converted.pth 13 | # fi 14 | # 15 | # deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \ 16 | # --model_name_or_path ${model} \ 17 | # --deepspeed configs/ds_config_multimodal.json \ 18 | # --arch_type vision_encoder_decoder \ 19 | # --task vqa \ 20 | # --custom_model \ 21 | # --prompt_format mini_gpt \ 22 | # --prompt_structure "###Human: {input_text}###Assistant:" \ 23 | # --llm_model_name_or_path LMFlow/Full-Robin-7b-v2 \ 24 | # --checkpoint_path output_models/pretrained_minigpt4_7b_converted.pth \ 25 | # --low_resource True \ 26 | # --max_new_tokens 1024 27 | 28 | if [ ! -f output_models/pretrained_minigpt4_13b.pth ]; then 29 | cd output_models && ./download.sh minigpt4_13b && cd - 30 | fi 31 | 32 | if [ ! -f output_models/pretrained_minigpt4_13b_converted.pth ]; then 33 | python utils/convert_minigpt4_checkpoints.py \ 34 | --model_path output_models/pretrained_minigpt4_13b.pth \ 35 | --save_path output_models/pretrained_minigpt4_13b_converted.pth 36 | fi 37 | 38 | deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \ 39 | --model_name_or_path ${model} \ 40 | --deepspeed configs/ds_config_multimodal.json \ 41 | --arch_type vision_encoder_decoder \ 42 | --task vqa \ 43 | --custom_model \ 44 | --prompt_format mini_gpt \ 45 | --prompt_structure "###Human: {input_text}###Assistant:" \ 46 | --llm_model_name_or_path LMFlow/Full-Robin-13b-v2 \ 47 | --checkpoint_path output_models/pretrained_minigpt4_13b_converted.pth \ 48 | --low_resource True \ 49 | --max_new_tokens 1024 50 | -------------------------------------------------------------------------------- /examples/finetune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """A one-line summary of the module or program, terminated by a period. 5 | 6 | Leave one blank line. The rest of this docstring should contain an 7 | overall description of the module or program. Optionally, it may also 8 | contain a brief description of exported classes and functions and/or usage 9 | examples. 10 | 11 | Typical usage example: 12 | 13 | foo = ClassFoo() 14 | bar = foo.FunctionBar() 15 | """ 16 | 17 | import sys 18 | import os 19 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) 20 | from transformers import HfArgumentParser 21 | 22 | from lmflow.args import ( 23 | ModelArguments, 24 | DatasetArguments, 25 | AutoArguments, 26 | ) 27 | 28 | from lmflow.datasets.dataset import Dataset 29 | from lmflow.models.auto_model import AutoModel 30 | from lmflow.pipeline.auto_pipeline import AutoPipeline 31 | 32 | 33 | def main(): 34 | # Parses arguments 35 | pipeline_name = "finetuner" 36 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) 37 | 38 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) 39 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 40 | # If we pass only one argument to the script and it's the path to a json file, 41 | # let's parse it to get our arguments. 
42 | model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 43 | else: 44 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() 45 | 46 | # Initialization 47 | finetuner = AutoPipeline.get_pipeline( 48 | pipeline_name=pipeline_name, 49 | model_args=model_args, 50 | data_args=data_args, 51 | pipeline_args=pipeline_args, 52 | ) 53 | dataset = Dataset(data_args) 54 | model = AutoModel.get_model(model_args) 55 | 56 | # Finetuning 57 | tuned_model = finetuner.tune(model=model, dataset=dataset) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /tests/datasets/test_dataset.py: -------------------------------------------------------------------------------- 1 | #!/bin/env/python3 2 | # coding=utf-8 3 | """A one-line summary of the module or program, terminated by a period. 4 | 5 | Leave one blank line. The rest of this docstring should contain an 6 | overall description of the module or program. Optionally, it may also 7 | contain a brief description of exported classes and functions and/or usage 8 | examples. 9 | 10 | Typical usage example: 11 | 12 | foo = ClassFoo() 13 | bar = foo.FunctionBar() 14 | """ 15 | from __future__ import absolute_import 16 | import unittest 17 | 18 | import json 19 | import os 20 | from pathlib import Path 21 | 22 | from lmflow.args import DatasetArguments 23 | from lmflow.datasets.dataset import Dataset 24 | 25 | 26 | class DatasetTest(unittest.TestCase): 27 | 28 | def test_init(self): 29 | dataset_dir = 'data/example_dataset/train' 30 | data_args = DatasetArguments( 31 | dataset_path=dataset_dir 32 | ) 33 | dataset = Dataset(data_args, backend='huggingface') 34 | hf_dataset = dataset.get_backend_dataset() 35 | 36 | with open(os.path.join(Path(dataset_dir), 'train_50.json'), 'r') as fin: 37 | json_obj = json.load(fin) 38 | for i in range(len(hf_dataset)): 39 | self.assertEqual(json_obj['instances'][i], hf_dataset[i]) 40 | 41 | 42 | def test_create_from_dict(self): 43 | data_dict = { 44 | "type": "text2text", 45 | "instances": [ 46 | { "input": "INPUT 1", "output": "OUTPUT 1" }, 47 | { "input": "INPUT 2", "output": "OUTPUT 2" }, 48 | ] 49 | } 50 | dataset = Dataset.create_from_dict(data_dict) 51 | self.assertEqual(dataset.to_dict(), data_dict) 52 | 53 | 54 | def test_create_from_dict_bad_type(self): 55 | data_dict = { 56 | "type": "non-supported", 57 | "instances": [ 58 | { "input": "INPUT 1", "output": "OUTPUT 1" }, 59 | { "input": "INPUT 2", "output": "OUTPUT 2" }, 60 | ] 61 | } 62 | with self.assertRaises(ValueError): 63 | dataset = Dataset.create_from_dict(data_dict) 64 | -------------------------------------------------------------------------------- /docs/source/_static/logo5.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experimental/RAFT-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.18.0 2 | asttokens==2.2.1 3 | backcall==0.2.0 4 | bitsandbytes==0.37.2 5 | certifi==2022.12.7 6 | charset-normalizer==3.1.0 7 | clip==1.0== 8 | cmake==3.26.1 9 | comm==0.1.3 10 | contourpy==1.0.7 11 | cycler==0.11.0 12 | debugpy==1.6.7 13 | decorator==5.1.1 14 | diffusers==0.14.0 15 | executing==1.2.0 16 | filelock==3.11.0 17 | fonttools==4.39.3 18 | ftfy==6.1.1 19 | huggingface-hub==0.13.4 20 | idna==3.4 21 | 
importlib-metadata==6.2.0 22 | importlib-resources==5.12.0 23 | ipykernel==6.22.0 24 | ipython==8.12.0 25 | jedi==0.18.2 26 | Jinja2==3.1.2 27 | jupyter_client==8.1.0 28 | jupyter_core==5.3.0 29 | kiwisolver==1.4.4 30 | lit==16.0.0 31 | MarkupSafe==2.1.2 32 | matplotlib==3.7.1 33 | matplotlib-inline==0.1.6 34 | mpmath==1.3.0 35 | mypy-extensions==1.0.0 36 | nest-asyncio==1.5.6 37 | networkx==3.1 38 | numpy==1.24.2 39 | nvidia-cublas-cu11==11.10.3.66 40 | nvidia-cuda-cupti-cu11==11.7.101 41 | nvidia-cuda-nvrtc-cu11==11.7.99 42 | nvidia-cuda-runtime-cu11==11.7.99 43 | nvidia-cudnn-cu11==8.5.0.96 44 | nvidia-cufft-cu11==10.9.0.58 45 | nvidia-curand-cu11==10.2.10.91 46 | nvidia-cusolver-cu11==11.4.0.1 47 | nvidia-cusparse-cu11==11.7.4.91 48 | nvidia-nccl-cu11==2.14.3 49 | nvidia-nvtx-cu11==11.7.91 50 | open-clip-torch==2.16.0 51 | packaging==23.0 52 | pandas==2.0.0 53 | parso==0.8.3 54 | pexpect==4.8.0 55 | pickleshare==0.7.5 56 | Pillow==9.5.0 57 | pip==23.0.1 58 | platformdirs==3.2.0 59 | prompt-toolkit==3.0.38 60 | protobuf==3.20.3 61 | psutil==5.9.4 62 | ptyprocess==0.7.0 63 | pure-eval==0.2.2 64 | Pygments==2.14.0 65 | pyparsing==3.0.9 66 | pyre-extensions==0.0.23 67 | python-dateutil==2.8.2 68 | pytz==2023.3 69 | PyYAML==6.0 70 | pyzmq==25.0.2 71 | regex==2023.3.23 72 | requests==2.28.2 73 | sentencepiece==0.1.97 74 | setuptools==65.6.3 75 | six==1.16.0 76 | stack-data==0.6.2 77 | sympy==1.11.1 78 | timm==0.6.13 79 | tokenizers==0.13.3 80 | torch==2.0.0 81 | torchvision==0.15.1 82 | tornado==6.2 83 | tqdm==4.65.0 84 | traitlets==5.9.0 85 | transformers==4.27.4 86 | triton==2.0.0 87 | typing_extensions==4.5.0 88 | typing-inspect==0.8.0 89 | tzdata==2023.3 90 | urllib3==1.26.15 91 | wcwidth==0.2.6 92 | wheel==0.38.4 93 | xformers==0.0.18 94 | zipp==3.15.0 95 | 96 | -------------------------------------------------------------------------------- /scripts/data_preprocess/shuffle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Samples a certain ratio of instances from a dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import random 12 | import sys 13 | import textwrap 14 | 15 | def parse_argument(sys_argv): 16 | """Parses arguments from command line. 17 | Args: 18 | sys_argv: the list of arguments (strings) from command line. 19 | Returns: 20 | A struct whose member corresponds to the required (optional) variable. 
21 | For example, 22 | ``` 23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 24 | args.input # 'a.txt' 25 | args.num # 10 26 | ``` 27 | """ 28 | parser = argparse.ArgumentParser( 29 | formatter_class=argparse.RawTextHelpFormatter) 30 | 31 | # Training parameters 32 | parser.add_argument( 33 | "--dataset_path", type=str, 34 | default=None, 35 | help="input dataset path, reads from stdin by default" 36 | ) 37 | parser.add_argument( 38 | "--output_path", type=str, 39 | default=None, 40 | help="output dataset path, writes to stdout by default" 41 | ) 42 | parser.add_argument( 43 | "--seed", type=int, default=42, 44 | help="pseudorandom seed" 45 | ) 46 | 47 | # Parses from commandline 48 | args = parser.parse_args(sys_argv[1:]) 49 | 50 | return args 51 | 52 | 53 | def main(): 54 | args = parse_argument(sys.argv) 55 | if args.dataset_path is not None: 56 | with open(args.dataset_path, "r") as fin: 57 | data_dict = json.load(fin) 58 | else: 59 | data_dict = json.load(sys.stdin) 60 | 61 | random.seed(args.seed) 62 | random.shuffle(data_dict["instances"]) 63 | 64 | if args.output_path is not None: 65 | with open(args.output_path, "w") as fout: 66 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 67 | else: 68 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'LMFlow' 10 | copyright = 'LMFlow 2023' 11 | author = 'The LMFlow Team' 12 | 13 | import sys 14 | import os 15 | sys.path.insert(0,os.path.abspath('../..')) 16 | 17 | 18 | # -- General configuration --------------------------------------------------- 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 20 | 21 | extensions = [] 22 | 23 | templates_path = ['_templates'] 24 | exclude_patterns = [] 25 | 26 | extensions = [ 27 | "sphinx.ext.autodoc", 28 | "sphinx.ext.autosummary", 29 | "sphinx.ext.todo", 30 | "sphinx.ext.viewcode", 31 | 'myst_parser', 32 | 'autoapi.extension', 33 | #"sphinxext.rediraffe", 34 | "sphinx_design", 35 | #"sphinx_copybutton", 36 | # For extension examples and demos 37 | #"ablog", 38 | "matplotlib.sphinxext.plot_directive", 39 | #"myst_nb", 40 | # "nbsphinx", # Uncomment and comment-out MyST-NB for local testing purposes. 
41 | "numpydoc", 42 | #"sphinx_togglebutton", 43 | #"sphinx_favicon", 44 | ] 45 | 46 | autosummary_generate = True 47 | 48 | autoapi_type = 'python' 49 | autoapi_dirs = ['../../src'] 50 | 51 | html_theme_options = { 52 | "header_links_before_dropdown": 4, 53 | "icon_links": [ 54 | { 55 | "name": "LMFlow", 56 | "url": "https://github.com/OptimalScale/LMFlow", 57 | "icon": "_static/logo5.svg", 58 | "type": "local", 59 | "attributes": {"target": "_blank"}, 60 | }, 61 | ], 62 | "logo": { 63 | "text": "LMFlow", 64 | "image_dark": "_static/logo5.svg", 65 | "alt_text": "LMFlow", 66 | }, 67 | } 68 | 69 | 70 | # -- Options for HTML output ------------------------------------------------- 71 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 72 | 73 | html_theme = 'pydata_sphinx_theme' 74 | html_static_path = ['_static'] 75 | -------------------------------------------------------------------------------- /scripts/data_preprocess/sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Samples a certain ratio of instances from a dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import random 12 | import sys 13 | import textwrap 14 | 15 | def parse_argument(sys_argv): 16 | """Parses arguments from command line. 17 | Args: 18 | sys_argv: the list of arguments (strings) from command line. 19 | Returns: 20 | A struct whose member corresponds to the required (optional) variable. 21 | For example, 22 | ``` 23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 24 | args.input # 'a.txt' 25 | args.num # 10 26 | ``` 27 | """ 28 | parser = argparse.ArgumentParser( 29 | formatter_class=argparse.RawTextHelpFormatter) 30 | 31 | # Training parameters 32 | parser.add_argument( 33 | "--dataset_path", type=str, 34 | default=None, 35 | help="input dataset path, reads from stdin by default" 36 | ) 37 | parser.add_argument( 38 | "--output_path", type=str, 39 | default=None, 40 | help="output dataset path, writes to stdout by default" 41 | ) 42 | parser.add_argument( 43 | "--ratio", type=float, required=True, 44 | help="sample ratio, will be floored if number of samples is not a int" 45 | ) 46 | parser.add_argument( 47 | "--seed", type=int, default=42, 48 | help="pseudorandom seed" 49 | ) 50 | 51 | # Parses from commandline 52 | args = parser.parse_args(sys_argv[1:]) 53 | 54 | return args 55 | 56 | 57 | def main(): 58 | args = parse_argument(sys.argv) 59 | if args.dataset_path is not None: 60 | with open(args.dataset_path, "r") as fin: 61 | data_dict = json.load(fin) 62 | else: 63 | data_dict = json.load(sys.stdin) 64 | 65 | random.seed(args.seed) 66 | num_instances = len(data_dict["instances"]) 67 | num_sample = int(num_instances * args.ratio) 68 | 69 | data_dict["instances"] = random.sample(data_dict["instances"], num_sample) 70 | 71 | if args.output_path is not None: 72 | with open(args.output_path, "w") as fout: 73 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 74 | else: 75 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /docs/source/_static/logo4.svg: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /scripts/data_preprocess/add_prompt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Adds prompt structure to a text2text dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import textwrap 12 | import sys 13 | 14 | def parse_argument(sys_argv): 15 | """Parses arguments from command line. 16 | Args: 17 | sys_argv: the list of arguments (strings) from command line. 18 | Returns: 19 | A struct whose member corresponds to the required (optional) variable. 20 | For example, 21 | ``` 22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 23 | args.input # 'a.txt' 24 | args.num # 10 25 | ``` 26 | """ 27 | parser = argparse.ArgumentParser( 28 | formatter_class=argparse.RawTextHelpFormatter) 29 | 30 | # Training parameters 31 | parser.add_argument( 32 | "--dataset_path", type=str, 33 | default=None, 34 | help=textwrap.dedent("input dataset path, reads from stdin by default") 35 | ) 36 | parser.add_argument( 37 | "--output_path", type=str, 38 | default=None, 39 | help=textwrap.dedent("output dataset path, writes to stdout by default") 40 | ) 41 | parser.add_argument( 42 | "--prompt_structure", type=str, 43 | default="{input}", 44 | help=textwrap.dedent("prompt structure to augment input") 45 | ) 46 | 47 | # Parses from commandline 48 | args = parser.parse_args(sys_argv[1:]) 49 | 50 | return args 51 | 52 | 53 | def main(): 54 | args = parse_argument(sys.argv) 55 | if args.dataset_path is not None: 56 | with open(args.dataset_path, "r") as fin: 57 | data_dict = json.load(fin) 58 | else: 59 | data_dict = json.load(sys.stdin) 60 | 61 | if data_dict["type"] != "text2text": 62 | raise NotImplementedError( 63 | "only support text2text prompt augmentation" 64 | ) 65 | 66 | data_dict["instances"] = [ 67 | { 68 | "input": args.prompt_structure.format(input=instance["input"]), 69 | "output": instance["output"], 70 | } 71 | for instance in data_dict["instances"] 72 | ] 73 | if args.output_path is not None: 74 | with open(args.output_path, "w") as fout: 75 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 76 | else: 77 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /scripts/data_preprocess/concat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Merges an extra dataset into current dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import textwrap 12 | import sys 13 | 14 | def parse_argument(sys_argv): 15 | """Parses arguments from command line. 16 | Args: 17 | sys_argv: the list of arguments (strings) from command line. 18 | Returns: 19 | A struct whose member corresponds to the required (optional) variable. 
20 | For example, 21 | ``` 22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 23 | args.input # 'a.txt' 24 | args.num # 10 25 | ``` 26 | """ 27 | parser = argparse.ArgumentParser( 28 | formatter_class=argparse.RawTextHelpFormatter) 29 | 30 | # Training parameters 31 | parser.add_argument( 32 | "--output_path", type=str, 33 | default=None, 34 | help=textwrap.dedent("output dataset path, writes to stdout by default") 35 | ) 36 | parser.add_argument( 37 | "--merge_from_path", type=str, 38 | nargs="+", 39 | help=textwrap.dedent( 40 | "dataset path of the extra dataset that will be merged" 41 | " into input dataset" 42 | ) 43 | ) 44 | 45 | # Parses from commandline 46 | args = parser.parse_args(sys_argv[1:]) 47 | 48 | return args 49 | 50 | 51 | def main(): 52 | args = parse_argument(sys.argv) 53 | 54 | if args.merge_from_path is not None: 55 | for i in range(0, len(args.merge_from_path)): 56 | with open(args.merge_from_path[i], "r") as fin: 57 | extra_data_dict = json.load(fin) 58 | if i == 0: 59 | data_dict = extra_data_dict 60 | else: 61 | if data_dict["type"] != extra_data_dict["type"]: 62 | raise ValueError( 63 | 'two dataset have different types:' 64 | f' input dataset: "{data_dict["type"]}";' 65 | f' merge from dataset: "{extra_data_dict["type"]}"' 66 | ) 67 | data_dict["instances"].extend(extra_data_dict["instances"]) 68 | else: 69 | raise ValueError("No merge files specified") 70 | 71 | if args.output_path is not None: 72 | with open(args.output_path, "w") as fout: 73 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 74 | else: 75 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /tests/pipeline/test_auto_pipeline.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lmflow.args import DatasetArguments 4 | from lmflow.args import EvaluatorArguments 5 | from lmflow.args import FinetunerArguments 6 | from lmflow.args import InferencerArguments 7 | from lmflow.args import ModelArguments 8 | from lmflow.args import RaftAlignerArguments 9 | from lmflow.pipeline.auto_pipeline import AutoPipeline 10 | from lmflow.pipeline.evaluator import Evaluator 11 | from lmflow.pipeline.finetuner import Finetuner 12 | from lmflow.pipeline.inferencer import Inferencer 13 | from lmflow.pipeline.raft_aligner import RaftAligner 14 | 15 | MODEL_NAME = "gpt2" 16 | 17 | 18 | class AutoPipelineTest(unittest.TestCase): 19 | 20 | def test_get_evaluator_pipeline(self): 21 | model_args = ModelArguments(model_name_or_path=MODEL_NAME) 22 | dataset_args = DatasetArguments() 23 | evaluator_args = EvaluatorArguments() 24 | pipeline = AutoPipeline.get_pipeline( 25 | "evaluator", model_args, dataset_args, evaluator_args) 26 | 27 | self.assertTrue(isinstance(pipeline, Evaluator)) 28 | 29 | def test_get_finetuner_pipeline(self): 30 | model_args = ModelArguments(model_name_or_path=MODEL_NAME) 31 | dataset_args = DatasetArguments() 32 | finetuner_args = FinetunerArguments(output_dir="~/tmp") 33 | pipeline = AutoPipeline.get_pipeline( 34 | "finetuner", model_args, dataset_args, finetuner_args) 35 | 36 | self.assertTrue(isinstance(pipeline, Finetuner)) 37 | 38 | def test_get_inferencer_pipeline(self): 39 | model_args = ModelArguments(model_name_or_path=MODEL_NAME) 40 | dataset_args = DatasetArguments() 41 | inferencer_args = InferencerArguments() 42 | pipeline = 
AutoPipeline.get_pipeline( 43 | "inferencer", model_args, dataset_args, inferencer_args) 44 | 45 | self.assertTrue(isinstance(pipeline, Inferencer)) 46 | 47 | def test_get_raft_aligner_pipeline(self): 48 | model_args = ModelArguments(model_name_or_path=MODEL_NAME) 49 | dataset_args = DatasetArguments() 50 | raft_aligner_args = RaftAlignerArguments(output_dir="~/tmp") 51 | pipeline = AutoPipeline.get_pipeline( 52 | "raft_aligner", model_args, dataset_args, raft_aligner_args) 53 | 54 | self.assertTrue(isinstance(pipeline, RaftAligner)) 55 | 56 | def test_get_unsupported_pipeline(self): 57 | model_args = ModelArguments(model_name_or_path=MODEL_NAME) 58 | dataset_args = DatasetArguments() 59 | 60 | with self.assertRaisesRegex(NotImplementedError, "Pipeline \"unsupported\" is not supported"): 61 | pipeline = AutoPipeline.get_pipeline( 62 | "unsupported", model_args, dataset_args, None) 63 | -------------------------------------------------------------------------------- /scripts/data_preprocess/add_end_mark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Adds prompt structure to a text2text dataset. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import textwrap 12 | import sys 13 | 14 | def parse_argument(sys_argv): 15 | """Parses arguments from command line. 16 | Args: 17 | sys_argv: the list of arguments (strings) from command line. 18 | Returns: 19 | A struct whose member corresponds to the required (optional) variable. 20 | For example, 21 | ``` 22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 23 | args.input # 'a.txt' 24 | args.num # 10 25 | ``` 26 | """ 27 | parser = argparse.ArgumentParser( 28 | formatter_class=argparse.RawTextHelpFormatter) 29 | 30 | # Training parameters 31 | parser.add_argument( 32 | "--dataset_path", type=str, 33 | default=None, 34 | help=textwrap.dedent("input dataset path, reads from stdin by default") 35 | ) 36 | parser.add_argument( 37 | "--output_path", type=str, 38 | default=None, 39 | help=textwrap.dedent("output dataset path, writes to stdout by default") 40 | ) 41 | parser.add_argument( 42 | "--end_mark", type=str, 43 | default="###", 44 | help=textwrap.dedent("end mark that append to the end of output") 45 | ) 46 | 47 | # Parses from commandline 48 | args = parser.parse_args(sys_argv[1:]) 49 | 50 | return args 51 | 52 | 53 | def main(): 54 | args = parse_argument(sys.argv) 55 | if args.dataset_path is not None: 56 | with open(args.dataset_path, "r") as fin: 57 | data_dict = json.load(fin) 58 | else: 59 | data_dict = json.load(sys.stdin) 60 | 61 | output_field_map = { 62 | "text_only": "text", 63 | "text2text": "output", 64 | } 65 | data_dict_type = data_dict["type"] 66 | if not data_dict_type in output_field_map: 67 | raise NotImplementedError( 68 | "only support text_only or text2text dataset" 69 | ) 70 | 71 | output_field = output_field_map[data_dict_type] 72 | 73 | num_instances = len(data_dict["instances"]) 74 | for i in range(num_instances): 75 | data_dict["instances"][i][output_field] += args.end_mark 76 | 77 | if args.output_path is not None: 78 | with open(args.output_path, "w") as fout: 79 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 80 | else: 81 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | 
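87 | # Example usage (a sketch; the output path below is hypothetical):
88 | #
89 | #   python scripts/data_preprocess/add_end_mark.py \
90 | #     --dataset_path ./data/example_dataset/train/train_50.json \
91 | #     --output_path ./data/example_dataset/train/train_50_end_mark.json \
92 | #     --end_mark "###"
93 | #
94 | # Without --dataset_path/--output_path the script reads the JSON dataset from
95 | # stdin and writes the result to stdout, so it can be chained with the other
96 | # scripts in scripts/data_preprocess/.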
-------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | LMFlow is available as a docker image in Docker Hub, built from the Dockerfile 4 | in this directory, with cuda:11.3.0-cudnn8 (source docker: 5 | nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04). You need to have at least an 6 | Nvidia 3090 GPU on your machine with a CUDA driver compatible with cuda:11.3.0 to 7 | run this docker image. 8 | 9 | ## Install docker with nvidia support 10 | 11 | First, you may need to install docker with nvidia support. This step requires 12 | root permission. If you don't have it, you may need to contact the system 13 | administrator to do that for you. 14 | 15 | We provide an example for Ubuntu 20.04. For other operating systems, you may 16 | refer to Nvidia's [Install 17 | Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). 18 | 19 | ```sh 20 | curl https://get.docker.com | sh && sudo systemctl --now enable docker 21 | 22 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ 23 | && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ 24 | | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ 25 | && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list \ 26 | | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ 27 | | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 28 | 29 | sudo apt-get update 30 | sudo apt-get install -y nvidia-container-toolkit 31 | sudo nvidia-ctk runtime configure --runtime=docker 32 | sudo systemctl restart docker 33 | ``` 34 | 35 | ## Pull docker image and run 36 | 37 | Use the following command to pull our docker image. 38 | 39 | ```sh 40 | docker pull optimalscale/lmflow 41 | ``` 42 | 43 | The working directory in docker is `/LMFlow`, where LMFlow (commit: 44 | [fa0e66f94](https://github.com/OptimalScale/LMFlow/tree/fa0e66f94eb5b7bfd624afdf9826b054641e3373)) 45 | is cloned and installed. Use the following command to enter the docker 46 | container, where `./LMFlow/log/finetune` in the container will be mapped to 47 | `./output_dir/log/finetune` on the host machine. You may add more directory 48 | mappings in a similar manner. 49 | 50 | ```sh 51 | docker run \ 52 | -v ./output_dir/log/finetune:/LMFlow/log/finetune \ 53 | --gpus=all \ 54 | --shm-size=64g \ 55 | -e WANDB_DISABLED=true \ 56 | -it \ 57 | --rm \ 58 | optimalscale/lmflow \ 59 | bash 60 | ``` 61 | 62 | Then you will be able to work inside the docker container, just like on a physical 63 | machine. Notice that to use multiple GPUs, you need to allocate enough 64 | shared memory. We have set up the dependencies for you, so you can directly 65 | run our scripts, e.g. 66 | 67 | ``` 68 | ./scripts/run_chatbot.sh 69 | ./scripts/run_evaluation.sh 70 | 71 | # May need a GPU with --bf16 support, or you can remove --bf16 72 | # and use --fp16 instead 73 | ./scripts/run_finetune.sh 74 | ``` 75 | -------------------------------------------------------------------------------- /scripts/data_preprocess/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | Merges one or more extra datasets into the current dataset.
6 | """ 7 | from __future__ import absolute_import 8 | 9 | import argparse 10 | import json 11 | import textwrap 12 | import sys 13 | 14 | def parse_argument(sys_argv): 15 | """Parses arguments from command line. 16 | Args: 17 | sys_argv: the list of arguments (strings) from command line. 18 | Returns: 19 | A struct whose member corresponds to the required (optional) variable. 20 | For example, 21 | ``` 22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 23 | args.input # 'a.txt' 24 | args.num # 10 25 | ``` 26 | """ 27 | parser = argparse.ArgumentParser( 28 | formatter_class=argparse.RawTextHelpFormatter) 29 | 30 | parser.add_argument( 31 | "--dataset_path", type=str, 32 | default=None, 33 | help=textwrap.dedent("input dataset path, reads from stdin by default") 34 | ) 35 | # Training parameters 36 | parser.add_argument( 37 | "--output_path", type=str, 38 | default=None, 39 | help=textwrap.dedent("output dataset path, writes to stdout by default") 40 | ) 41 | parser.add_argument( 42 | "--merge_from_path", type=str, 43 | nargs="+", 44 | help=textwrap.dedent( 45 | "dataset path of the extra dataset that will be merged" 46 | " into input dataset" 47 | ) 48 | ) 49 | 50 | # Parses from commandline 51 | args = parser.parse_args(sys_argv[1:]) 52 | 53 | return args 54 | 55 | 56 | def main(): 57 | args = parse_argument(sys.argv) 58 | 59 | if args.dataset_path is not None: 60 | with open(args.dataset_path, "r") as fin: 61 | data_dict = json.load(fin) 62 | else: 63 | data_dict = json.load(sys.stdin) 64 | 65 | if args.merge_from_path is not None: 66 | for i in range(0, len(args.merge_from_path)): 67 | with open(args.merge_from_path[i], "r") as fin: 68 | extra_data_dict = json.load(fin) 69 | 70 | if data_dict["type"] != extra_data_dict["type"]: 71 | raise ValueError( 72 | 'two dataset have different types:' 73 | f' input dataset: "{data_dict["type"]}";' 74 | f' merge from dataset: "{extra_data_dict["type"]}"' 75 | ) 76 | data_dict["instances"].extend(extra_data_dict["instances"]) 77 | 78 | 79 | if args.output_path is not None: 80 | with open(args.output_path, "w") as fout: 81 | json.dump(data_dict, fout, indent=4, ensure_ascii=False) 82 | else: 83 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | _build 8 | 9 | # C extensions 10 | *.so 11 | 12 | # tests and logs 13 | tests/fixtures/cached_*_text.txt 14 | logs/ 15 | lightning_logs/ 16 | lang_code_data/ 17 | log/ 18 | regression_test/*/new_output_models 19 | regression_test/*/new_log 20 | output_dir/ 21 | 22 | # data files 23 | data/ 24 | 25 | # output models 26 | output_models/ 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *.cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # vscode 136 | .vs 137 | .vscode 138 | 139 | # Pycharm 140 | .idea 141 | 142 | # TF code 143 | tensorflow_code 144 | 145 | # Models 146 | proc_data 147 | 148 | # examples 149 | runs 150 | /runs_old 151 | /wandb 152 | /examples/runs 153 | /examples/**/*.args 154 | /examples/rag/sweep 155 | 156 | # data 157 | # /data 158 | serialization_dir 159 | 160 | # emacs 161 | *.*~ 162 | debug.env 163 | 164 | # vim 165 | .*.swp 166 | 167 | #ctags 168 | tags 169 | 170 | # pre-commit 171 | .pre-commit* 172 | 173 | # .lock 174 | *.lock 175 | 176 | # DS_Store (MacOS) 177 | .DS_Store 178 | 179 | # ruff 180 | .ruff_cache 181 | 182 | # lm_evaluation cache 183 | lm_cache/ 184 | -------------------------------------------------------------------------------- /docs/source/_static/logo6.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/merge_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import argparse 5 | import logging 6 | import os 7 | 8 | from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model 9 | import sentencepiece as spm 10 | 11 | import torch 12 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | from transformers import AutoTokenizer 15 | 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | if __name__ == '__main__': 19 | 20 | os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python" 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--tokenizer_dir', default='pinkmanlove/llama-7b-hf', type=str, required=False) 24 | parser.add_argument('--chinese_sp_model_file', default='./output_models/new_tokenizer/example.model', type=str) 25 | parser.add_argument('--output_dir', default='./output_models/merged_tokenizer', type=str, required=False) 26 | args = parser.parse_args() 27 | 28 | tokenizer_dir = args.tokenizer_dir 29 | chinese_sp_model_file = args.chinese_sp_model_file 30 | output_dir = args.output_dir 31 | 32 | # load 33 | old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) 34 | chinese_sp_model = 
spm.SentencePieceProcessor() 35 | chinese_sp_model.Load(chinese_sp_model_file) 36 | 37 | old_spm = sp_pb2_model.ModelProto() 38 | old_spm.ParseFromString(old_tokenizer.sp_model.serialized_model_proto()) 39 | chinese_spm = sp_pb2_model.ModelProto() 40 | chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto()) 41 | 42 | ## Add Chinese tokens to old tokenizer 43 | old_spm_tokens_set=set(p.piece for p in old_spm.pieces) 44 | for p in chinese_spm.pieces: 45 | piece = p.piece 46 | if piece not in old_spm_tokens_set: 47 | new_p = sp_pb2_model.ModelProto().SentencePiece() 48 | new_p.piece = piece 49 | new_p.score = 0 50 | old_spm.pieces.append(new_p) 51 | 52 | ## Save 53 | output_sp_dir = output_dir + '/merged_tokenizer_sp' 54 | output_hf_dir = output_dir + '/merged_tokenizer_hf' # the path to save tokenizer 55 | os.makedirs(output_sp_dir,exist_ok=True) 56 | with open(output_sp_dir+'/merged_tokenizer.model', 'wb') as f: 57 | f.write(old_spm.SerializeToString()) 58 | 59 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,vocab_file=output_sp_dir+'/merged_tokenizer.model') 60 | 61 | tokenizer.save_pretrained(output_hf_dir) 62 | logging.info(f"Merged tokenizer has been saved to %s",output_dir) 63 | 64 | 65 | # Test 66 | old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) 67 | new_tokenizer = AutoTokenizer.from_pretrained(output_hf_dir) 68 | logging.info(f"Old tokenizer vocab size: %d",len(old_tokenizer)) 69 | logging.info(f"New tokenizer vocab size: %d",len(new_tokenizer)) 70 | 71 | text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 72 | The primary use of LLaMA is research on large language models, including''' 73 | logging.info(f"Test text:\n %s",text) 74 | logging.info(f"Tokenized by LLaMA tokenizer:%s",old_tokenizer.tokenize(text)) 75 | logging.info(f"Tokenized by Chinese-LLaMA tokenizer:%s",new_tokenizer.tokenize(text)) -------------------------------------------------------------------------------- /src/lmflow/pipeline/utils/peft_trainer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """Trainer for Peft models 4 | """ 5 | 6 | from __future__ import absolute_import 7 | from transformers import Trainer 8 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 9 | from transformers.trainer_callback import ( 10 | TrainerCallback, 11 | TrainerControl, 12 | TrainerState, 13 | ) 14 | from transformers.training_args import TrainingArguments 15 | import os 16 | import numpy as np 17 | 18 | class PeftTrainer(Trainer): 19 | def _save_checkpoint(self, _, trial, metrics=None): 20 | """ Don't save base model, optimizer etc. 
21 | but create checkpoint folder (needed for saving adapter) """ 22 | checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" 23 | 24 | run_dir = self._get_output_dir(trial=trial) 25 | output_dir = os.path.join(run_dir, checkpoint_folder) 26 | 27 | if metrics is not None and self.args.metric_for_best_model is not None: 28 | metric_to_check = self.args.metric_for_best_model 29 | if not metric_to_check.startswith("eval_"): 30 | metric_to_check = f"eval_{metric_to_check}" 31 | metric_value = metrics[metric_to_check] 32 | 33 | operator = np.greater if self.args.greater_is_better else np.less 34 | if (self.state.best_metric is None or self.state.best_model_checkpoint is None 35 | or operator(metric_value, self.state.best_metric)): 36 | self.state.best_metric = metric_value 37 | 38 | self.state.best_model_checkpoint = output_dir 39 | 40 | os.makedirs(output_dir, exist_ok=True) 41 | 42 | if self.args.should_save: 43 | self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) 44 | 45 | class PeftSavingCallback(TrainerCallback): 46 | """ Correctly save PEFT model and not full model """ 47 | def _save(self, model, folder): 48 | if folder is None: 49 | folder = "" 50 | peft_model_path = os.path.join(folder, "adapter_model") 51 | model.save_pretrained(peft_model_path) 52 | 53 | def on_train_end(self, args: TrainingArguments, state: TrainerState, 54 | control: TrainerControl, **kwargs): 55 | """ Save final best model adapter """ 56 | self._save(kwargs['model'], state.best_model_checkpoint) 57 | 58 | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, 59 | control: TrainerControl, **kwargs): 60 | """ Save intermediate model adapters in case of interrupted training """ 61 | folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}") 62 | self._save(kwargs['model'], folder) 63 | 64 | def on_save( 65 | self, 66 | args: TrainingArguments, 67 | state: TrainerState, 68 | control: TrainerControl, 69 | **kwargs, 70 | ): 71 | checkpoint_folder = os.path.join( 72 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" 73 | ) 74 | self._save(kwargs['model'], checkpoint_folder) 75 | 76 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model") 77 | kwargs["model"].save_pretrained(peft_model_path) 78 | return control -------------------------------------------------------------------------------- /docs/source/examples/DATASETS.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | We provide several available datasets under `data`. You may download them all by running: 4 | ```sh 5 | cd data && ./download.sh all && cd - 6 | ``` 7 | You can replace `all` with a specific dataset name to only download that dataset (e.g. `./download.sh alpaca`). 8 | 9 | Customized datasets are strongly encouraged, since this way users can apply 10 | their own prompt engineering techniques over various source datasets. As long 11 | as the generated dataset following the format below, they can be accepted as 12 | the input of our pipelines :hugs: 13 | 14 | 15 | ## Dataset Format in General 16 | 17 | To specify the input for model finetune, users can provide a list of `.json` 18 | files under a specified dataset directory. For example, 19 | 20 | ```sh 21 | |- path_to_dataset 22 | |- data_1.json 23 | |- data_2.json 24 | |- another_data.json 25 | |- ... 26 | ``` 27 | 28 | For inference, we currently only support a single `.json` file. 
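Once prepared, the dataset path can be loaded programmatically via the `DatasetArguments` and `Dataset` classes, as used throughout the pipelines in this repo (a minimal sketch; `path_to_dataset` is a placeholder for your own directory or `.json` file):

```python
from lmflow.args import DatasetArguments
from lmflow.datasets.dataset import Dataset

# Point dataset_path at a directory of .json files (finetuning)
# or at a single .json file (inference).
data_args = DatasetArguments(dataset_path="path_to_dataset")
dataset = Dataset(data_args)
```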
29 | 30 | Each json file shall have the following format (three instances with four keys 31 | for example), 32 | 33 | ```json 34 | { 35 | "type": "TYPE", 36 | "instances": [ 37 | { 38 | "KEY_1": "VALUE_1.1", 39 | "KEY_2": "VALUE_1.2", 40 | "KEY_3": "VALUE_1.3", 41 | "KEY_4": "VALUE_1.4", 42 | }, 43 | { 44 | "KEY_1": "VALUE_2.1", 45 | "KEY_2": "VALUE_2.2", 46 | "KEY_3": "VALUE_2.3", 47 | "KEY_4": "VALUE_2.4", 48 | }, 49 | { 50 | "KEY_1": "VALUE_3.1", 51 | "KEY_2": "VALUE_3.2", 52 | "KEY_3": "VALUE_3.3", 53 | "KEY_4": "VALUE_3.4", 54 | }, 55 | ] 56 | } 57 | ``` 58 | 59 | where the `TYPE` indicates the dataset type and defines the set of keys 60 | `{ KEY_1, KEY_2, ... }` and their corresponding interpretations. The 61 | supported types are listed as follows. 62 | 63 | ## Supported Dataset and Detailed Formats 64 | 65 | ### TextOnly 66 | 67 | This is the most common dataset type, which only contains raw texts in each 68 | sample. This type of dataset can be used as the training set for text decoder 69 | models, or the input of decoder models / encoder-decoder models. Its format is 70 | as follows (three instances for example), 71 | 72 | ```json 73 | { 74 | "type": "text_only", 75 | "instances": [ 76 | { "text": "SAMPLE_TEXT_1" }, 77 | { "text": "SAMPLE_TEXT_2" }, 78 | { "text": "SAMPLE_TEXT_3" }, 79 | ] 80 | } 81 | ``` 82 | 83 | For example, `data/example_dataset/train/train_50.json` has the above format. 84 | 85 | ### Text2Text 86 | 87 | This is the dataset type mostly used for inference, which contains a pair of 88 | texts in each sample. This type of dataset can be used as the training set for 89 | text encoder-decoder models, or as question-answer pairs for evaluating model 90 | inference. Its format is as follows (three instances for example), 91 | 92 | ```json 93 | { 94 | "type": "text2text", 95 | "instances": [ 96 | { 97 | "input": "SAMPLE_INPUT_1", 98 | "output": "SAMPLE_OUTPUT_1", 99 | }, 100 | { 101 | "input": "SAMPLE_INPUT_2", 102 | "output": "SAMPLE_OUTPUT_2", 103 | }, 104 | { 105 | "input": "SAMPLE_INPUT_3", 106 | "output": "SAMPLE_OUTPUT_3", 107 | }, 108 | ] 109 | } 110 | ``` 111 | 112 | For example, `data/example_dataset/test/test_13.json` has the above format. 113 | -------------------------------------------------------------------------------- /src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | import transformers.models.llama.modeling_llama 4 | 5 | 6 | class ScaledRotaryEmbedding(torch.nn.Module): 7 | def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): 8 | super().__init__() 9 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) 10 | self.register_buffer("inv_freq", inv_freq) 11 | 12 | self.scale = 4 13 | max_position_embeddings = max_position_embeddings * self.scale 14 | 15 | # Build here to make `torch.jit.trace` work.
16 | self.max_seq_len_cached = max_position_embeddings 17 | t = torch.arange( 18 | self.max_seq_len_cached, 19 | device=self.inv_freq.device, 20 | dtype=self.inv_freq.dtype, 21 | ) 22 | 23 | 24 | t /= self.scale 25 | 26 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 27 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 28 | emb = torch.cat((freqs, freqs), dim=-1) 29 | 30 | #self.window = torch.hann_window(emb.shape[-1]) 31 | self.register_buffer( 32 | "cos_cached", (emb.cos())[None, None, :, :], persistent=False 33 | ) 34 | self.register_buffer( 35 | "sin_cached", (emb.sin())[None, None, :, :], persistent=False 36 | ) 37 | 38 | def forward(self, x, seq_len=None): 39 | # x: [bs, num_attention_heads, seq_len, head_size] 40 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 41 | if seq_len > self.max_seq_len_cached: 42 | self.max_seq_len_cached = seq_len 43 | t = torch.arange( 44 | self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype 45 | ) 46 | t /= self.scale  # scale positions down, consistent with `__init__` 47 | freqs = torch.einsum("i,j->ij", t, self.inv_freq) 48 | # Different from paper, but it uses a different permutation in order to obtain the same calculation 49 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 50 | self.register_buffer( 51 | "cos_cached", emb.cos()[None, None, :, :], persistent=False 52 | ) 53 | self.register_buffer( 54 | "sin_cached", emb.sin()[None, None, :, :], persistent=False 55 | ) 56 | return ( 57 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 58 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), 59 | ) 60 | 61 | old_init = transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.__init__ 62 | 63 | def ntk_scaled_init(self, dim, max_position_embeddings=2048, base=10000, device=None): 64 | #The method is just these three lines 65 | a = 8 #Alpha value 66 | max_position_embeddings = max_position_embeddings * a 67 | 68 | base = base * a ** (dim / (dim-2)) #Base change formula 69 | 70 | old_init(self, dim, max_position_embeddings, base, device) 71 | 72 | def replace_llama_rope_with_scaled_rope(): 73 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = ( 74 | ScaledRotaryEmbedding 75 | ) 76 | 77 | def repalce_llama_rope_init_with_scaled_rope_init(): 78 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.__init__ = ntk_scaled_init -------------------------------------------------------------------------------- /experimental/RAFT-diffusion/README.md: -------------------------------------------------------------------------------- 1 | # RAFT-Diffusion 2 | 3 | 4 | In this folder, we provide an example that shows how RAFT works on diffusion models. We will also integrate these scripts into the LMFlow APIs in the future. 5 | 6 | 7 | The requirements are shown below.
8 | ``` 9 | accelerate 0.18.0 10 | asttokens 2.2.1 11 | backcall 0.2.0 12 | bitsandbytes 0.37.2 13 | certifi 2022.12.7 14 | charset-normalizer 3.1.0 15 | clip 1.0 16 | cmake 3.26.1 17 | comm 0.1.3 18 | contourpy 1.0.7 19 | cycler 0.11.0 20 | debugpy 1.6.7 21 | decorator 5.1.1 22 | diffusers 0.14.0 23 | executing 1.2.0 24 | filelock 3.11.0 25 | fonttools 4.39.3 26 | ftfy 6.1.1 27 | huggingface-hub 0.13.4 28 | idna 3.4 29 | importlib-metadata 6.2.0 30 | importlib-resources 5.12.0 31 | ipykernel 6.22.0 32 | ipython 8.12.0 33 | jedi 0.18.2 34 | Jinja2 3.1.2 35 | jupyter_client 8.1.0 36 | jupyter_core 5.3.0 37 | kiwisolver 1.4.4 38 | lit 16.0.0 39 | MarkupSafe 2.1.2 40 | matplotlib 3.7.1 41 | matplotlib-inline 0.1.6 42 | mpmath 1.3.0 43 | mypy-extensions 1.0.0 44 | nest-asyncio 1.5.6 45 | networkx 3.1 46 | numpy 1.24.2 47 | nvidia-cublas-cu11 11.10.3.66 48 | nvidia-cuda-cupti-cu11 11.7.101 49 | nvidia-cuda-nvrtc-cu11 11.7.99 50 | nvidia-cuda-runtime-cu11 11.7.99 51 | nvidia-cudnn-cu11 8.5.0.96 52 | nvidia-cufft-cu11 10.9.0.58 53 | nvidia-curand-cu11 10.2.10.91 54 | nvidia-cusolver-cu11 11.4.0.1 55 | nvidia-cusparse-cu11 11.7.4.91 56 | nvidia-nccl-cu11 2.14.3 57 | nvidia-nvtx-cu11 11.7.91 58 | open-clip-torch 2.16.0 59 | packaging 23.0 60 | pandas 2.0.0 61 | parso 0.8.3 62 | pexpect 4.8.0 63 | pickleshare 0.7.5 64 | Pillow 9.5.0 65 | pip 23.0.1 66 | platformdirs 3.2.0 67 | prompt-toolkit 3.0.38 68 | protobuf 3.20.3 69 | psutil 5.9.4 70 | ptyprocess 0.7.0 71 | pure-eval 0.2.2 72 | Pygments 2.14.0 73 | pyparsing 3.0.9 74 | pyre-extensions 0.0.23 75 | python-dateutil 2.8.2 76 | pytz 2023.3 77 | PyYAML 6.0 78 | pyzmq 25.0.2 79 | regex 2023.3.23 80 | requests 2.28.2 81 | sentencepiece 0.1.97 82 | setuptools 65.6.3 83 | six 1.16.0 84 | stack-data 0.6.2 85 | sympy 1.11.1 86 | timm 0.6.13 87 | tokenizers 0.13.3 88 | torch 2.0.0 89 | torchvision 0.15.1 90 | tornado 6.2 91 | tqdm 4.65.0 92 | traitlets 5.9.0 93 | transformers 4.27.4 94 | triton 2.0.0 95 | typing_extensions 4.5.0 96 | typing-inspect 0.8.0 97 | tzdata 2023.3 98 | urllib3 1.26.15 99 | wcwidth 0.2.6 100 | wheel 0.38.4 101 | xformers 0.0.18 102 | zipp 3.15.0 103 | ``` 104 | 105 | We will also add a COLAB link for convenience. 
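Assuming the `requirements.txt` shipped in this directory pins the same versions as the list above, the environment can be reproduced with:

```sh
pip install -r experimental/RAFT-diffusion/requirements.txt
```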
106 | -------------------------------------------------------------------------------- /utils/lm_evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import fnmatch 5 | 6 | from lm_eval import tasks, evaluator 7 | 8 | logging.getLogger("openai").setLevel(logging.WARNING) 9 | 10 | 11 | class MultiChoice: 12 | def __init__(self, choices): 13 | self.choices = choices 14 | 15 | # Simple wildcard support (linux filename patterns) 16 | def __contains__(self, values): 17 | for value in values.split(","): 18 | if len(fnmatch.filter(self.choices, value)) == 0: 19 | return False 20 | 21 | return True 22 | 23 | def __iter__(self): 24 | for choice in self.choices: 25 | yield choice 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--model", required=True) 31 | parser.add_argument("--model_args", default="") 32 | parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) 33 | parser.add_argument("--provide_description", action="store_true") 34 | parser.add_argument("--num_fewshot", type=int, default=0) 35 | parser.add_argument("--batch_size", type=int, default=None) 36 | parser.add_argument("--device", type=str, default=None) 37 | parser.add_argument("--output_path", default=None) 38 | parser.add_argument("--limit", type=int, default=None) 39 | parser.add_argument("--no_cache", action="store_true") 40 | parser.add_argument("--decontamination_ngrams_path", default=None) 41 | parser.add_argument("--description_dict_path", default=None) 42 | parser.add_argument("--check_integrity", action="store_true") 43 | 44 | return parser.parse_args() 45 | 46 | 47 | # Returns a list containing all values of the source_list that 48 | # match at least one of the patterns 49 | def pattern_match(patterns, source_list): 50 | task_names = set() 51 | for pattern in patterns: 52 | for matching in fnmatch.filter(source_list, pattern): 53 | task_names.add(matching) 54 | return list(task_names) 55 | 56 | 57 | def main(): 58 | args = parse_args() 59 | 60 | assert not args.provide_description # not implemented 61 | 62 | if args.limit: 63 | print( 64 | "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
65 | ) 66 | 67 | if args.tasks is None: 68 | task_names = tasks.ALL_TASKS 69 | else: 70 | task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS) 71 | 72 | print(f"Selected Tasks: {task_names}") 73 | 74 | description_dict = {} 75 | if args.description_dict_path: 76 | with open(args.description_dict_path, "r") as f: 77 | description_dict = json.load(f) 78 | 79 | results = evaluator.simple_evaluate( 80 | model=args.model, 81 | model_args=args.model_args, 82 | tasks=task_names, 83 | num_fewshot=args.num_fewshot, 84 | batch_size=args.batch_size, 85 | device=args.device, 86 | no_cache=args.no_cache, 87 | limit=args.limit, 88 | description_dict=description_dict, 89 | decontamination_ngrams_path=args.decontamination_ngrams_path, 90 | check_integrity=args.check_integrity, 91 | ) 92 | 93 | dumped = json.dumps(results, indent=2) 94 | print(dumped) 95 | 96 | if args.output_path: 97 | with open(args.output_path, "w") as f: 98 | f.write(dumped) 99 | 100 | print( 101 | f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " 102 | f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}" 103 | ) 104 | print(evaluator.make_table(results)) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | import transformers 5 | from einops import rearrange 6 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 7 | from flash_attn.bert_padding import unpad_input, pad_input 8 | 9 | def _attn(self, query, key, value, attention_mask=None, head_mask=None): 10 | # (batch, head, seq_length, head_features) 11 | query = query.to(torch.bfloat16) 12 | key = key.to(torch.bfloat16) 13 | query = query * torch.sqrt(torch.tensor(self.head_dim)) 14 | qkv = torch.stack( 15 | [query, key, value], dim=2 16 | )# [bsz, nh, 3, t, hd] 17 | qkv = qkv.transpose(1,3)## [bsz, q_len, 3, nh, hd] 18 | bsz = qkv.shape[0] 19 | q_len = qkv.shape[1] 20 | 21 | attention_mask = torch.where(attention_mask == -0.0, True, False) 22 | key_padding_mask = rearrange(attention_mask, "b () () s -> b s") if attention_mask is not None else None 23 | if key_padding_mask is None: 24 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 25 | max_s = q_len 26 | cu_q_lens = torch.arange( 27 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 28 | ) 29 | output = flash_attn_unpadded_qkvpacked_func( 30 | qkv, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0 , softmax_scale=None, causal=True 31 | )# attention compute 32 | output = rearrange(output, "(b s) ... 
-> b s ...", b=bsz) 33 | else: 34 | nheads = qkv.shape[-2] 35 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 36 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 37 | x_unpad = rearrange( 38 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads 39 | ) 40 | output_unpad = flash_attn_unpadded_qkvpacked_func( 41 | x_unpad, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0, softmax_scale=None, causal=True 42 | ) 43 | output = rearrange( 44 | pad_input( 45 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len 46 | ), 47 | "b s (h d) -> b s h d", 48 | h=nheads, 49 | ) 50 | 51 | return output, None 52 | 53 | def forward( 54 | self, 55 | hidden_states, 56 | attention_mask=None, 57 | layer_past=None, 58 | head_mask=None, 59 | use_cache=False, 60 | output_attentions=False, 61 | ): 62 | 63 | assert head_mask is None, "head_mask is not supported" 64 | assert not output_attentions, "output_attentions is not supported" 65 | assert not use_cache, "use_cache is not supported" 66 | 67 | query = self.q_proj(hidden_states) 68 | key = self.k_proj(hidden_states) 69 | value = self.v_proj(hidden_states) 70 | 71 | query = self._split_heads(query, self.num_heads, self.head_dim) 72 | key = self._split_heads(key, self.num_heads, self.head_dim) 73 | value = self._split_heads(value, self.num_heads, self.head_dim) 74 | 75 | if layer_past is not None: 76 | past_key = layer_past[0] 77 | past_value = layer_past[1] 78 | key = torch.cat((past_key, key), dim=-2) 79 | value = torch.cat((past_value, value), dim=-2) 80 | 81 | present = None 82 | attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) 83 | new_shape = attn_output.size()[:-2] + (self.num_heads * self.head_dim,) 84 | attn_output = attn_output.view(new_shape) 85 | attn_output = self.out_proj(attn_output) 86 | attn_output = self.resid_dropout(attn_output) 87 | 88 | outputs = (attn_output, present) 89 | 90 | return outputs # a, present, (attentions) 91 | 92 | def replace_gpt_neo_attn_with_flash_attn(): 93 | transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._attn = _attn 94 | transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention.forward = forward -------------------------------------------------------------------------------- /docs/source/_static/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/examples/TASK_GUIDE.md: -------------------------------------------------------------------------------- 1 | # LMFlow Benchmark Guide 2 | 3 | We support two ways to add evaluation settings in our repo, `NLL Task Setting` and `LM-Evaluation Task Setting`. Below are the details of them: 4 | 5 | # 1. NLL Task Setting 6 | Users can easily create new tasks and evaluate their datasets on 7 | the provide `nll (Negative Log Likelihood)` metric. 8 | 9 | ## Setup 10 | 11 | Fork the main repo, clone it, and create a new branch with the name of 12 | your task, and install the following: 13 | 14 | ```bash 15 | # After forking... 16 | git clone https://github.com//LMFlow.git 17 | cd LMFlow 18 | git checkout -b 19 | conda create -n lmflow python=3.9 -y 20 | conda activate lmflow 21 | conda install mpi4py 22 | pip install -e . 
23 | ``` 24 | ## Create Your Task Dataset File 25 | We provide several available datasets under `data` after running 26 | ```sh 27 | cd data && ./download.sh && cd - 28 | ``` 29 | 30 | You can refer to some given evaluation dataset files and create your own. 31 | Also, you may refer to our guide on 32 | [DATASET](https://optimalscale.github.io/LMFlow/examples/DATASETS.html). 33 | 34 | In this step, you will need to decide your answer type like `text2text` 35 | or `text_only` (Notice that the current `nll` implementation only supports these 36 | two answer types). We will note the chosen answer type as ``. 37 | 38 | After preparing your own `DATASET` file, you can put it under `data` dir 39 | and make a `TASK` dir. 40 | 41 | ```bash 42 | mkdir 43 | mv 44 | ``` 45 | 46 | ## Task Registration 47 | 48 | Note the path of your dataset, `data//`. 49 | 50 | Open the file `examples/benchmarking.py`, add your task's info into 51 | `LOCAL_DATSET_GROUP_MAP`, `LOCAL_DATSET_MAP`, `LOCAL_DATSET_ANSWERTYPE_MAP` 52 | 53 | In `LOCAL_DATSET_MAP`, you will need to specify your `DATASET` files' path: 54 | 55 | ```python 56 | LOCAL_DATSET_MAP ={ 57 | "...":"...", 58 | "":"data//", 59 | } 60 | ``` 61 | 62 | In `LOCAL_DATSET_ANSWERTYPE_MAP`, you will need to specify your task's 63 | ``: 64 | 65 | ```python 66 | LOCAL_DATSET_ANSWERTYPE_MAP ={ 67 | "...":"...", 68 | "":", 69 | } 70 | ``` 71 | 72 | If you only have one task, you can add key-value pair like `"":""` 73 | in `LOCAL_DATSET_GROUP_MAP`: 74 | ```python 75 | LOCAL_DATSET_GROUP_MAP ={ 76 | "...":"...", 77 | "":"", 78 | } 79 | ``` 80 | 81 | 82 | If you want to combine several tasks, you may first specify a 83 | combination name `` and add key-value pair like 84 | `"":",,.."`in `LOCAL_DATSET_GROUP_MAP`. 85 | 86 | Remember to separate TASK by `,`: 87 | ```python 88 | LOCAL_DATSET_GROUP_MAP ={ 89 | "...":"...", 90 | "":",,..", 91 | } 92 | ``` 93 | 94 | After finishing changing these items, you can run your own `` like: 95 | 96 | ```bash 97 | deepspeed examples/benchmarking.py \ 98 | --answer_type \ 99 | --use_ram_optimized_load False \ 100 | --model_name_or_path ${model_name} \ 101 | --dataset_name data//\ 102 | --deepspeed examples/ds_config.json \ 103 | --metric nll \ 104 | --prompt_structure "###Human: {input}###Assistant:" \ 105 | | tee ${log_dir}/train.log \ 106 | 2> ${log_dir}/train.err 107 | ``` 108 | 109 | # 2. LM-Evaluation Task Setting 110 | 111 | We integrate [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) into 112 | `benchamrk.py` by directly executing the evaluate commands. Users 113 | can also use their evaluation by simply changing two items in 114 | `` of `examples/benchmarking.py`. 115 | 116 | Please refer to Eleuther's 117 | [task-table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) 118 | to get exact `` name. 119 | 120 | Similarly, you can combine several tasks, you may first specify a 121 | combination name `` and add key-value pair like 122 | `"":",,.."`in `LM_EVAL_DATASET_MAP`. 
123 | 124 | Also, remember to separate TASK by `,`: 125 | 126 | ```python 127 | LM_EVAL_DATASET_MAP ={ 128 | "...":"...", 129 | "":",,..", 130 | } 131 | ``` 132 | 133 | -------------------------------------------------------------------------------- /src/lmflow/utils/flash_attention/bloom_flash_attention.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | import transformers 8 | from transformers.models.bloom.modeling_bloom import dropout_add 9 | 10 | from einops import rearrange 11 | 12 | from .triton_flash_attention import flash_attn_qkvpacked_func 13 | 14 | def forward( 15 | self, 16 | hidden_states: torch.Tensor, 17 | residual: torch.Tensor, 18 | alibi: torch.Tensor, 19 | attention_mask: torch.Tensor, 20 | layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, 21 | head_mask: Optional[torch.Tensor] = None, 22 | use_cache: bool = False, 23 | output_attentions: bool = False, 24 | ): 25 | dtype = hidden_states.dtype 26 | fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] 27 | 28 | # 3 x [batch_size, seq_length, num_heads, head_dim] 29 | (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) 30 | 31 | batch_size, q_length, _, _ = query_layer.shape 32 | bsz, q_len = batch_size, q_length 33 | 34 | if layer_past is not None: 35 | past_key, past_value = layer_past 36 | # concatenate along seq_length dimension: 37 | # - key: [batch_size * self.num_heads, head_dim, kv_length] 38 | # - value: [batch_size * self.num_heads, kv_length, head_dim] 39 | key_layer = torch.cat((past_key, key_layer), dim=2) 40 | value_layer = torch.cat((past_value, value_layer), dim=1) 41 | 42 | if use_cache is True: 43 | present = (key_layer, value_layer) 44 | else: 45 | present = None 46 | 47 | reshaped_alibi = rearrange(alibi, '(b h) one s-> b h one s', h = self.num_heads) 48 | reshaped_alibi = reshaped_alibi * self.beta 49 | 50 | attention_mask = (1.0 - attention_mask) 51 | attention_mask = attention_mask[:, None, None, :].bool() 52 | reshaped_alibi_masked = reshaped_alibi.masked_fill(attention_mask, -1e9) 53 | 54 | reshaped_query_layer = query_layer 55 | reshaped_key_layer = key_layer 56 | reshaped_value_layer = value_layer 57 | 58 | qkv = torch.concat([reshaped_query_layer.unsqueeze(2), reshaped_key_layer.unsqueeze(2), reshaped_value_layer.unsqueeze(2)], dim = 2) 59 | 60 | output = flash_attn_qkvpacked_func( 61 | qkv, reshaped_alibi_masked, True, self.inv_norm_factor 62 | ) 63 | 64 | output = rearrange(output, 'b s h d -> (b h) s d') 65 | 66 | # change view [batch_size, num_heads, q_length, head_dim] 67 | context_layer = self._merge_heads(output) 68 | 69 | # aggregate results across tp ranks. 
See here: https://github.com/pytorch/pytorch/issues/76232 70 | if self.pretraining_tp > 1 and self.slow_but_exact: 71 | slices = self.hidden_size / self.pretraining_tp 72 | output_tensor = torch.zeros_like(context_layer) 73 | for i in range(self.pretraining_tp): 74 | output_tensor = output_tensor + F.linear( 75 | context_layer[:, :, int(i * slices) : int((i + 1) * slices)], 76 | self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], 77 | ) 78 | else: 79 | output_tensor = self.dense(context_layer) 80 | 81 | output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) 82 | 83 | outputs = (output_tensor, present) 84 | if output_attentions: 85 | outputs += (context_layer,) 86 | 87 | return outputs 88 | 89 | 90 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 91 | # requires the attention mask to be the same as the key_padding_mask 92 | def _prepare_attn_mask( 93 | self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int 94 | ) -> torch.BoolTensor: 95 | 96 | return attention_mask 97 | 98 | def replace_bloom_attn_with_flash_attn(): 99 | transformers.models.bloom.modeling_bloom.BloomModel._prepare_attn_mask = ( 100 | _prepare_attn_mask 101 | ) 102 | transformers.models.bloom.modeling_bloom.BloomAttention.forward = forward -------------------------------------------------------------------------------- /output_models/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function main() { 4 | public_server="http://lmflow.org:5000" 5 | if [ $# -lt 1 -o "$1" = "-h" -o "$1" = "--help" ]; then 6 | echo "Usage: bash $(basename $0) model_name" 7 | echo "Example: bash $(basename $0) instruction_ckpt" 8 | echo "Example: bash $(basename $0) all" 9 | fi 10 | 11 | if [ "$1" = "llama7b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then 12 | echo "downloading llama7b-lora-medical.tar.gz" 13 | filename='llama7b-lora-medical.tar.gz' 14 | wget ${public_server}/${filename} 15 | tar zxvf ${filename} 16 | rm ${filename} 17 | fi 18 | 19 | if [ "$1" = "llama13b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then 20 | echo "downloading llama13b-lora-medical.tar.gz" 21 | filename='llama13b-lora-medical.tar.gz' 22 | wget ${public_server}/${filename} 23 | tar zxvf ${filename} 24 | rm ${filename} 25 | fi 26 | 27 | if [ "$1" = "llama30b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then 28 | echo "downloading llama30b-lora-medical.tar.gz" 29 | filename='llama30b-lora-medical.tar.gz' 30 | wget ${public_server}/${filename} 31 | tar zxvf ${filename} 32 | rm ${filename} 33 | fi 34 | 35 | if [ "$1" = "llama7b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then 36 | echo "downloading llama7b-lora-170k.tar.gz" 37 | filename='llama7b-lora-170k.tar.gz' 38 | wget ${public_server}/${filename} 39 | tar zxvf ${filename} 40 | rm ${filename} 41 | fi 42 | 43 | if [ "$1" = "llama7b-lora-380k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then 44 | echo "downloading llama7b-lora-380k.tar.gz" 45 | filename='llama7b-lora-380k.tar.gz' 46 | wget ${public_server}/${filename} 47 | tar zxvf ${filename} 48 | rm ${filename} 49 | fi 50 | 51 | if [ "$1" = "llama13b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then 52 | echo "downloading llama13b-lora-170k.tar.gz" 53 | filename='llama13b-lora-170k.tar.gz' 54 | wget ${public_server}/${filename} 55 | tar zxvf ${filename} 56 | rm ${filename} 57 | fi 58 | 59 | if [ "$1" = 
"llama13b-lora-380k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then 60 | echo "downloading llama13b-lora-380k.tar.gz" 61 | filename='llama13b-lora-380k.tar.gz' 62 | wget ${public_server}/${filename} 63 | tar zxvf ${filename} 64 | rm ${filename} 65 | fi 66 | 67 | if [ "$1" = "llama30b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then 68 | echo "downloading llama30b-lora-170k.tar.gz" 69 | filename='llama30b-lora-170k.tar.gz' 70 | wget ${public_server}/${filename} 71 | tar zxvf ${filename} 72 | rm ${filename} 73 | fi 74 | 75 | if [ "$1" = "llama7b-lora-movie-reviewer" -o "$1" = "raft_ckpt" -o "$1" = "all" ]; then 76 | echo "downloading llama7b-lora-movie-reviewer" 77 | filename='llama7b-lora-movie-reviewer.tar.gz' 78 | wget ${public_server}/${filename} 79 | tar zxvf ${filename} 80 | rm ${filename} 81 | fi 82 | 83 | if [ "$1" = "cockatoo-7b" -o "$1" = "all" ]; then 84 | echo "downloading cockatoo-7b" 85 | filename='cockatoo-7b.tar.gz' 86 | wget ${public_server}/${filename} 87 | tar zxvf ${filename} 88 | rm ${filename} 89 | fi 90 | 91 | if [ "$1" = "parakeets-2.7b" -o "$1" = "all" ]; then 92 | echo "downloading parakeets-2.7b" 93 | filename='parakeets-2.7b.tar.gz' 94 | wget ${public_server}/${filename} 95 | tar zxvf ${filename} 96 | rm ${filename} 97 | fi 98 | 99 | if [ "$1" = "robin-7b" -o "$1" = "all" ]; then 100 | echo "downloading robin-7b" 101 | filename='robin-7b-v2-delta.tar.gz' 102 | wget ${public_server}/${filename} 103 | tar zxvf ${filename} 104 | rm ${filename} 105 | fi 106 | 107 | if [ "$1" = "minigpt4_7b" -o "$1" = "all" ]; then 108 | echo "downloading minigpt4_7b" 109 | filename='pretrained_minigpt4_7b.pth' 110 | wget ${public_server}/${filename} 111 | fi 112 | 113 | if [ "$1" = "minigpt4_13b" -o "$1" = "all" ]; then 114 | echo "downloading minigpt4_13b" 115 | filename='pretrained_minigpt4_13b.pth' 116 | wget ${public_server}/${filename} 117 | fi 118 | } 119 | 120 | main "$@" 121 | -------------------------------------------------------------------------------- /src/lmflow/utils/flash_attention/llama_flash_attention.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | from torch import nn 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 8 | 9 | from einops import rearrange 10 | 11 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 12 | from flash_attn.bert_padding import unpad_input, pad_input 13 | 14 | 15 | def forward( 16 | self, 17 | hidden_states: torch.Tensor, 18 | attention_mask: Optional[torch.Tensor] = None, 19 | position_ids: Optional[torch.Tensor] = None, 20 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 21 | output_attentions: bool = False, 22 | use_cache: bool = False, 23 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 24 | """Input shape: Batch x Time x Channel 25 | 26 | attention_mask: [bsz, q_len] 27 | """ 28 | bsz, q_len, _ = hidden_states.size() 29 | 30 | query_states = ( 31 | self.q_proj(hidden_states) 32 | .view(bsz, q_len, self.num_heads, self.head_dim) 33 | .transpose(1, 2) 34 | ) 35 | key_states = ( 36 | self.k_proj(hidden_states) 37 | .view(bsz, q_len, self.num_heads, self.head_dim) 38 | .transpose(1, 2) 39 | ) 40 | value_states = ( 41 | self.v_proj(hidden_states) 42 | .view(bsz, q_len, self.num_heads, self.head_dim) 43 | .transpose(1, 2) 44 | ) 45 | # [bsz, q_len, nh, hd] 46 | # [bsz, nh, q_len, hd] 47 | 48 | 
kv_seq_len = key_states.shape[-2] 49 | assert past_key_value is None, "past_key_value is not supported" 50 | 51 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 52 | query_states, key_states = apply_rotary_pos_emb( 53 | query_states, key_states, cos, sin, position_ids 54 | ) 55 | # [bsz, nh, t, hd] 56 | assert not output_attentions, "output_attentions is not supported" 57 | assert not use_cache, "use_cache is not supported" 58 | 59 | # Flash attention codes from 60 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py 61 | 62 | # transform the data into the format required by flash attention 63 | qkv = torch.stack( 64 | [query_states, key_states, value_states], dim=2 65 | ) # [bsz, nh, 3, q_len, hd] 66 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] 67 | # We have disabled _prepare_decoder_attention_mask in LlamaModel 68 | # the attention_mask should be the same as the key_padding_mask 69 | key_padding_mask = attention_mask 70 | 71 | if key_padding_mask is None: 72 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 73 | max_s = q_len 74 | cu_q_lens = torch.arange( 75 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 76 | ) 77 | output = flash_attn_unpadded_qkvpacked_func( 78 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 79 | ) 80 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz) 81 | else: 82 | nheads = qkv.shape[-2] 83 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 84 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 85 | x_unpad = rearrange( 86 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads 87 | ) 88 | output_unpad = flash_attn_unpadded_qkvpacked_func( 89 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 90 | ) 91 | output = rearrange( 92 | pad_input( 93 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len 94 | ), 95 | "b s (h d) -> b s h d", 96 | h=nheads, 97 | ) 98 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None 99 | 100 | 101 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 102 | # requires the attention mask to be the same as the key_padding_mask 103 | def _prepare_decoder_attention_mask( 104 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 105 | ): 106 | # [bsz, seq_len] 107 | return attention_mask 108 | 109 | 110 | def replace_llama_attn_with_flash_attn(): 111 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 112 | _prepare_decoder_attention_mask 113 | ) 114 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward -------------------------------------------------------------------------------- /docs/source/_static/logo2.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lmflow/utils/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | """ 4 | Commonly used constants. 5 | """ 6 | 7 | TEXT_ONLY_DATASET_DESCRIPTION = ( 8 | """ 9 | "text_only": a dataset with only raw text instances, with following format: 10 | 11 | { 12 | "type": "text_only", 13 | "instances": [ 14 | { "text": "TEXT_1" }, 15 | { "text": "TEXT_2" }, 16 | ... 
17 | ] 18 | } 19 | """ 20 | ).lstrip("\n") 21 | 22 | 23 | TEXT_ONLY_DATASET_DETAILS = ( 24 | """ 25 | For example, 26 | 27 | ```python 28 | from lmflow.datasets import Dataset 29 | 30 | data_dict = { 31 | "type": "text_only", 32 | "instances": [ 33 | { "text": "Human: Hello. Bot: Hi!" }, 34 | { "text": "Human: How are you today? Bot: Fine, thank you!" }, 35 | ] 36 | } 37 | dataset = Dataset.create_from_dict(data_dict) 38 | ``` 39 | 40 | You may also save the corresponding format to json, 41 | ```python 42 | import json 43 | from lmflow.args import DatasetArguments 44 | from lmflow.datasets import Dataset 45 | 46 | data_dict = { 47 | "type": "text_only", 48 | "instances": [ 49 | { "text": "Human: Hello. Bot: Hi!" }, 50 | { "text": "Human: How are you today? Bot: Fine, thank you!" }, 51 | ] 52 | } 53 | with open("data.json", "w") as fout: 54 | json.dump(data_dict, fout) 55 | 56 | data_args = DatasetArgument(dataset_path="data.json") 57 | dataset = Dataset(data_args) 58 | new_data_dict = dataset.to_dict() 59 | # `new_data_dict` Should have the same content as `data_dict` 60 | ``` 61 | """ 62 | ).lstrip("\n") 63 | 64 | 65 | TEXT2TEXT_DATASET_DESCRIPTION = ( 66 | """ 67 | "text2text": a dataset with input & output instances, with following format: 68 | 69 | { 70 | "type": "text2text", 71 | "instances": [ 72 | { "input": "INPUT_1", "output": "OUTPUT_1" }, 73 | { "input": "INPUT_2", "output": "OUTPUT_2" }, 74 | ... 75 | ] 76 | } 77 | """ 78 | ).lstrip("\n") 79 | 80 | 81 | TEXT2TEXT_DATASET_DETAILS = ( 82 | """ 83 | For example, 84 | 85 | ```python 86 | from lmflow.datasets import Dataset 87 | 88 | data_dict = { 89 | "type": "text2text", 90 | "instances": [ 91 | { 92 | "input": "Human: Hello.", 93 | "output": "Bot: Hi!", 94 | }, 95 | { 96 | "input": "Human: How are you today?", 97 | "output": "Bot: Fine, thank you! And you?", 98 | } 99 | ] 100 | } 101 | dataset = Dataset.create_from_dict(data_dict) 102 | ``` 103 | 104 | You may also save the corresponding format to json, 105 | ```python 106 | import json 107 | from lmflow.args import DatasetArguments 108 | from lmflow.datasets import Dataset 109 | 110 | data_dict = { 111 | "type": "text2text", 112 | "instances": [ 113 | { 114 | "input": "Human: Hello.", 115 | "output": "Bot: Hi!", 116 | }, 117 | { 118 | "input": "Human: How are you today?", 119 | "output": "Bot: Fine, thank you! And you?", 120 | } 121 | ] 122 | } 123 | with open("data.json", "w") as fout: 124 | json.dump(data_dict, fout) 125 | 126 | data_args = DatasetArgument(dataset_path="data.json") 127 | dataset = Dataset(data_args) 128 | new_data_dict = dataset.to_dict() 129 | # `new_data_dict` Should have the same content as `data_dict` 130 | ``` 131 | """ 132 | ).lstrip("\n") 133 | 134 | 135 | FLOAT_ONLY_DATASET_DESCRIPTION = ( 136 | """ 137 | "float_only": a dataset with only float instances, with following format: 138 | 139 | { 140 | "type": "float_only", 141 | "instances": [ 142 | { "value": "FLOAT_1" }, 143 | { "value": "FLOAT_2" }, 144 | ... 
145 | ] 146 | } 147 | """ 148 | ).lstrip("\n") 149 | 150 | 151 | TEXT_ONLY_DATASET_LONG_DESCRITION = ( 152 | TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS 153 | ) 154 | 155 | TEXT2TEXT_DATASET_LONG_DESCRITION = ( 156 | TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS 157 | ) 158 | 159 | 160 | DATASET_DESCRIPTION_MAP = { 161 | "text_only": TEXT_ONLY_DATASET_DESCRIPTION, 162 | "text2text": TEXT2TEXT_DATASET_DESCRIPTION, 163 | "float_only": FLOAT_ONLY_DATASET_DESCRIPTION, 164 | } 165 | 166 | INSTANCE_FIELDS_MAP = { 167 | "text_only": ["text"], 168 | "text2text": ["input", "output"], 169 | "float_only": ["value"], 170 | "image_text": ["images", "text"], 171 | } 172 | -------------------------------------------------------------------------------- /tests/utils/test_data_utils.py: -------------------------------------------------------------------------------- 1 | #!/bin/env/python3 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | import unittest 5 | from lmflow.utils.data_utils import load_data, batchlize, answer_extraction 6 | from lmflow.args import DatasetArguments 7 | 8 | groundtruth_inputs = ['The Transformer architecture [START_REF]', 9 | 'The Schwarzschild radius is defined as: \\[', 10 | 'A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? ', 11 | '[START_I_SMILES]', 12 | '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords', 13 | 'The reason why Transformers replaced RNNs was because', 14 | 'Question: What is the notch signaling pathway?\n\nAnswer:', 15 | '# Multi-Head Attention\n\n', 16 | 'Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n', 17 | 'Lecture 1: The Ising Model\n\n', 18 | 'Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. We believe these results demonstrate the potential for language models as a new interface for science. 
We open source the model for the benefit of the scientific community.\n\nTLDR:', 19 | '[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for', 20 | 'what is the capital of US?', 21 | ] 22 | 23 | groundtruth_outputs = ["NA"] * 13 24 | 25 | mc_output = ['Answer: (C) Generation of free radicals', 26 | 'Answer: C Generation of free radicals', 27 | 'Answer: C', 28 | 'Answer: (C)', 29 | 'A: C', 30 | 'A: (C)', 31 | 'Output: (C) Generation of free radicals', 32 | 'Output: C Generation of free radicals', 33 | 'Output: C', 34 | 'Output: (C)', 35 | ] 36 | 37 | mc_answer = ['c'] * 10 38 | 39 | qa_output = ['Yes.', 40 | 'Answer: Yes', 41 | 'Answer: Yes.', 42 | 'Yes ', 43 | 'No.', 44 | 'Answer: No', 45 | 'Answer: No.', 46 | 'No ', 47 | 'Maybe.', 48 | 'Answer: Maybe', 49 | 'Answer: Maybe.', 50 | 'Maybe ', 51 | ] 52 | qa_answer = ['yes'] * 4 + ['no'] * 4 + ['maybe'] * 4 53 | 54 | class DataUtilsTest(unittest.TestCase): 55 | def test_load_data(self): 56 | file_name = "data/example_dataset/test/test_13.json" 57 | 58 | inputs, outputs, datasize = load_data(file_name=file_name) 59 | # Test for inputs 60 | for i in range(0,len(inputs)): 61 | self.assertEqual(inputs[i], groundtruth_inputs[i]) 62 | # Test for outputs 63 | for i in range(0,len(outputs)): 64 | self.assertEqual(outputs[i], groundtruth_outputs[i]) 65 | # Test for datasize 66 | self.assertEqual(datasize, 13) 67 | 68 | def test_batchlize(self): 69 | file_name = "data/example_dataset/test/test_13.json" 70 | inputs, outputs, datasize = load_data(file_name=file_name) 71 | dataset = [] 72 | for idx in range(len(outputs)): 73 | dataset.append({"input":inputs[idx], "output":outputs[idx], "input_idx":idx}) 74 | # TODO: add test for random shuffle case 75 | dataloader = batchlize(dataset, 4, random_shuffle= False) 76 | self.assertEqual(len(dataloader), 13 // 4 + 1) 77 | 78 | def test_answer_extraction(self): 79 | # Test for medmcqa dataset 80 | for i in range(0,len(mc_output)): 81 | self.assertEqual(answer_extraction(mc_output[i], answer_type="medmcqa"), mc_answer[i]) 82 | # Test for usmle dataset 83 | for i in range(0,len(mc_output)): 84 | self.assertEqual(answer_extraction(mc_output[i], answer_type="usmle"), mc_answer[i]) 85 | # Test for pubmedqa dataset 86 | for i in range(0,len(qa_output)): 87 | self.assertEqual(answer_extraction(qa_output[i], answer_type="pubmedqa"), qa_answer[i]) 88 | -------------------------------------------------------------------------------- /examples/chatbot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """A simple shell chatbot implemented with lmflow APIs. 
5 | """ 6 | import logging 7 | import json 8 | import os 9 | import sys 10 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) 11 | import warnings 12 | 13 | from dataclasses import dataclass, field 14 | from transformers import HfArgumentParser 15 | from typing import Optional 16 | 17 | from lmflow.datasets.dataset import Dataset 18 | from lmflow.pipeline.auto_pipeline import AutoPipeline 19 | from lmflow.models.auto_model import AutoModel 20 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments 21 | 22 | 23 | logging.disable(logging.ERROR) 24 | warnings.filterwarnings("ignore") 25 | 26 | 27 | @dataclass 28 | class ChatbotArguments: 29 | prompt_structure: Optional[str] = field( 30 | default="{input_text}", 31 | metadata={ 32 | "help": "prompt structure given user's input text" 33 | }, 34 | ) 35 | end_string: Optional[str] = field( 36 | default="\n\n", 37 | metadata={ 38 | "help": "end string mark of the chatbot's output" 39 | }, 40 | ) 41 | 42 | def main(): 43 | pipeline_name = "inferencer" 44 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) 45 | 46 | parser = HfArgumentParser(( 47 | ModelArguments, 48 | PipelineArguments, 49 | ChatbotArguments, 50 | )) 51 | model_args, pipeline_args, chatbot_args = ( 52 | parser.parse_args_into_dataclasses() 53 | ) 54 | inferencer_args = pipeline_args 55 | 56 | with open (pipeline_args.deepspeed, "r") as f: 57 | ds_config = json.load(f) 58 | 59 | model = AutoModel.get_model( 60 | model_args, 61 | tune_strategy='none', 62 | ds_config=ds_config, 63 | device=pipeline_args.device, 64 | ) 65 | 66 | # We don't need input data, we will read interactively from stdin 67 | data_args = DatasetArguments(dataset_path=None) 68 | dataset = Dataset(data_args) 69 | 70 | inferencer = AutoPipeline.get_pipeline( 71 | pipeline_name=pipeline_name, 72 | model_args=model_args, 73 | data_args=data_args, 74 | pipeline_args=pipeline_args, 75 | ) 76 | 77 | # Chats 78 | model_name = model_args.model_name_or_path 79 | if model_args.lora_model_path is not None: 80 | model_name += f" + {model_args.lora_model_path}" 81 | 82 | guide_message = ( 83 | "\n" 84 | f"#############################################################################\n" 85 | f"## A {model_name} chatbot is now chatting with you!\n" 86 | f"#############################################################################\n" 87 | "\n" 88 | ) 89 | print(guide_message) 90 | 91 | # context = ( 92 | # "You are a helpful assistant who follows the given instructions" 93 | # " unconditionally." 
94 | # ) 95 | context = "" 96 | 97 | end_string = chatbot_args.end_string 98 | prompt_structure = chatbot_args.prompt_structure 99 | 100 | while True: 101 | input_text = input("User >>> ") 102 | if input_text == "exit": 103 | print("exit...") 104 | break 105 | elif input_text == "reset": 106 | context = "" 107 | print("Chat history cleared") 108 | continue 109 | if not input_text: 110 | input_text = " " 111 | 112 | context += prompt_structure.format(input_text=input_text) 113 | context = context[-model.get_max_length():] # Memory of the bot 114 | 115 | input_dataset = dataset.from_dict({ 116 | "type": "text_only", 117 | "instances": [ { "text": context } ] 118 | }) 119 | 120 | print("Bot: ", end="") 121 | print_index = 0 122 | 123 | token_per_step = 4 124 | 125 | for response, flag_break in inferencer.stream_inference( 126 | context=context, 127 | model=model, 128 | max_new_tokens=inferencer_args.max_new_tokens, 129 | token_per_step=token_per_step, 130 | temperature=inferencer_args.temperature, 131 | end_string=end_string, 132 | input_dataset=input_dataset 133 | ): 134 | # Prints characters in the buffer 135 | new_print_index = print_index 136 | for char in response[print_index:]: 137 | if end_string is not None and char == end_string[0]: 138 | if new_print_index + len(end_string) >= len(response): 139 | break 140 | 141 | new_print_index += 1 142 | print(char, end="", flush=True) 143 | 144 | print_index = new_print_index 145 | 146 | if flag_break: 147 | break 148 | print("\n", end="") 149 | 150 | context += response + "\n" 151 | 152 | 153 | if __name__ == "__main__": 154 | main() 155 | -------------------------------------------------------------------------------- /docs/source/_static/logo3.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /service/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | 5 | from flask import Flask, request, stream_with_context 6 | from flask import render_template 7 | from flask_cors import CORS 8 | from accelerate import Accelerator 9 | from dataclasses import dataclass, field 10 | from transformers import HfArgumentParser 11 | from typing import Optional 12 | 13 | from lmflow.datasets.dataset import Dataset 14 | from lmflow.pipeline.auto_pipeline import AutoPipeline 15 | from lmflow.models.auto_model import AutoModel 16 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments 17 | 18 | WINDOW_LENGTH = 512 19 | 20 | @dataclass 21 | class AppArguments: 22 | end_string: Optional[str] = field( 23 | default="##", 24 | metadata={ 25 | "help": "end string mark of the chatbot's output" 26 | }, 27 | ) 28 | max_new_tokens: Optional[int] = field( 29 | default=200, 30 | metadata={ 31 | "help": "maximum number of generated tokens" 32 | }, 33 | ) 34 | 35 | parser = HfArgumentParser(( 36 | ModelArguments, 37 | AppArguments, 38 | )) 39 | 40 | model_args, app_args = ( 41 | parser.parse_args_into_dataclasses() 42 | ) 43 | 44 | app = Flask(__name__) 45 | CORS(app) 46 | ds_config_path = "./examples/ds_config.json" 47 | with open (ds_config_path, "r") as f: 48 | ds_config = json.load(f) 49 | 50 | 51 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 52 | world_size = int(os.getenv("WORLD_SIZE", "1")) 53 | torch.cuda.set_device(local_rank) 54 | model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config, use_accelerator=True) 55 | 
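# ---------------------------------------------------------------------------
# Minimal client sketch (a hedged example, not part of the original file):
# the /predict route defined below expects a JSON body with an "Input" field
# (the latest user message) and a "History" field (a list of turn dicts that
# carry a "content" key; the exact turn layout produced by the bundled
# frontend is an assumption here). Note that stream_generate() yields the
# cumulative decoded text on every step, so each streamed chunk supersedes
# the previous one rather than extending it.
#
#     import requests
#
#     payload = {
#         "Input": "What is LMFlow?",
#         "History": [
#             {"content": "What is LMFlow?"},  # hypothetical turn layout
#             {"content": ""},
#         ],
#     }
#     with requests.post("http://127.0.0.1:5000/predict",
#                        json=payload, stream=True) as resp:
#         for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#             print(chunk)  # each chunk is a cumulative snapshot of the reply
# ---------------------------------------------------------------------------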
accelerator = Accelerator() 56 | 57 | def stream_generate(inputs,context_len = 1024, max_new_tokens=128, end_string="##"): 58 | 59 | 60 | max_src_len = context_len - max_new_tokens - len(end_string) 61 | input_ids = model.tokenizer(inputs).input_ids 62 | input_echo_len = len(input_ids) 63 | output_ids = list(input_ids) 64 | input_ids = input_ids[-max_src_len:] 65 | 66 | past_key_values = out = None 67 | flag_stop = False 68 | for i in range(0, max_new_tokens): 69 | with accelerator.autocast(): 70 | if i == 0: 71 | with torch.no_grad(): 72 | out = model.backend_model(torch.as_tensor([input_ids], device=local_rank), use_cache=True) 73 | logits = out.logits 74 | past_key_values = out.past_key_values 75 | else: 76 | with torch.no_grad(): 77 | out = model.backend_model( 78 | input_ids=torch.as_tensor([[token]], device=local_rank), 79 | use_cache=True, 80 | past_key_values=past_key_values, 81 | ) 82 | logits = out.logits 83 | past_key_values = out.past_key_values 84 | 85 | last_token_logits = logits[0, -1, :] 86 | token = int(torch.argmax(last_token_logits)) 87 | output_ids.append(token) 88 | 89 | tmp_output_ids = output_ids[input_echo_len:] 90 | 91 | output = model.tokenizer.decode( 92 | tmp_output_ids, 93 | skip_special_tokens=True, 94 | spaces_between_special_tokens=False, 95 | ) 96 | 97 | if end_string in output: 98 | index = output.index(end_string) 99 | output = output[:index] 100 | flag_stop = True 101 | yield output.replace("\ufffd","") 102 | 103 | if flag_stop == True: 104 | break 105 | 106 | @app.route('/predict',methods = ['POST']) 107 | def predict(): 108 | if(request.method == "POST"): 109 | try: 110 | user_input = request.get_json()["Input"] 111 | conversation = request.get_json()["History"] 112 | 113 | history_input = "" 114 | if(len(conversation) >= 2): 115 | if(len(conversation) == 2): 116 | history_input ="###Human: " + user_input +" " 117 | else: 118 | for i in range(0, len(conversation)-1): 119 | if(i % 2 == 0): 120 | history_input = history_input + "###Human: " + conversation[i+1]["content"] + " " 121 | elif(i % 2 == 1): 122 | history_input = history_input + "###Assistant:" + conversation[i+1]["content"] 123 | history_input = history_input + "###Assistant:" 124 | 125 | if len(model.encode(history_input))> WINDOW_LENGTH: 126 | inputs = model.encode(history_input) 127 | inputs = inputs[-WINDOW_LENGTH:] 128 | history_input = model.decode(inputs) 129 | 130 | return app.response_class(stream_with_context(stream_generate(history_input, 131 | max_new_tokens=app_args.max_new_tokens, 132 | end_string=app_args.end_string))) 133 | except Exception as ex: 134 | print(ex) 135 | text_out = ex 136 | else: 137 | text_out = "Not POST Method" 138 | return text_out 139 | 140 | @app.route('/',methods = ['GET']) 141 | def login(): 142 | 143 | return render_template('index.html') 144 | 145 | 146 | app.run(port = 5000, debug = False) 147 | -------------------------------------------------------------------------------- /scripts/export_llama_state_dict_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Export state dict for downstream inference, such as llama.cpp 2 | 3 | import json 4 | import os 5 | 6 | import torch 7 | import transformers 8 | from peft import PeftModel 9 | from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402 10 | 11 | 12 | def permute(w): 13 | return ( 14 | w.view(n_heads, dim // n_heads // 2, 2, dim) 15 | .transpose(1, 2) 16 | .reshape(dim, dim) 17 | ) 18 | 19 | 20 | def unpermute(w): 21 | return ( 22 | 
w.view(n_heads, 2, dim // n_heads // 2, dim) 23 | .transpose(1, 2) 24 | .reshape(dim, dim) 25 | ) 26 | 27 | def translate_state_dict_key(k): # noqa: C901 28 | k = k.replace("base_model.model.", "") 29 | if k == "model.embed_tokens.weight": 30 | return "tok_embeddings.weight" 31 | elif k == "model.norm.weight": 32 | return "norm.weight" 33 | elif k == "lm_head.weight": 34 | return "output.weight" 35 | elif k.startswith("model.layers."): 36 | layer = k.split(".")[2] 37 | if k.endswith(".self_attn.q_proj.weight"): 38 | return f"layers.{layer}.attention.wq.weight" 39 | elif k.endswith(".self_attn.k_proj.weight"): 40 | return f"layers.{layer}.attention.wk.weight" 41 | elif k.endswith(".self_attn.v_proj.weight"): 42 | return f"layers.{layer}.attention.wv.weight" 43 | elif k.endswith(".self_attn.o_proj.weight"): 44 | return f"layers.{layer}.attention.wo.weight" 45 | elif k.endswith(".mlp.gate_proj.weight"): 46 | return f"layers.{layer}.feed_forward.w1.weight" 47 | elif k.endswith(".mlp.down_proj.weight"): 48 | return f"layers.{layer}.feed_forward.w2.weight" 49 | elif k.endswith(".mlp.up_proj.weight"): 50 | return f"layers.{layer}.feed_forward.w3.weight" 51 | elif k.endswith(".input_layernorm.weight"): 52 | return f"layers.{layer}.attention_norm.weight" 53 | elif k.endswith(".post_attention_layernorm.weight"): 54 | return f"layers.{layer}.ffn_norm.weight" 55 | elif k.endswith("rotary_emb.inv_freq") or "lora" in k: 56 | return None 57 | else: 58 | print(layer, k) 59 | raise NotImplementedError 60 | else: 61 | print(k) 62 | raise NotImplementedError 63 | 64 | PARAM_LIST = { 65 | 7:{ 66 | "dim": 4096, 67 | "multiple_of": 256, 68 | "n_heads": 32, 69 | "n_layers": 32, 70 | "norm_eps": 1e-06, 71 | "vocab_size": -1, 72 | }, 73 | 13:{ 74 | "dim": 5120, 75 | "multiple_of": 256, 76 | "n_heads": 40, 77 | "n_layers": 40, 78 | "norm_eps": 1e-06, 79 | "vocab_size": -1, 80 | }, 81 | 33:{ 82 | "dim": 6656, 83 | "multiple_of": 256, 84 | "n_heads": 52, 85 | "n_layers": 60, 86 | "norm_eps": 1e-06, 87 | "vocab_size": -1, 88 | }} 89 | 90 | 91 | BASE_MODEL = os.environ.get("BASE_MODEL", None) 92 | assert ( 93 | BASE_MODEL 94 | ), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=decapoda-research/llama-30b-hf`" # noqa: E501 95 | LORA_MODEL = os.environ.get("LORA_MODEL", None) 96 | 97 | MODEL_SIZE = int(os.environ.get("MODEL_SIZE", None)) 98 | assert ( 99 | MODEL_SIZE 100 | ), "Please specify a value for MODEL_SIZE environment variable, e.g. 
`export MODEL_SIZE=33`" # noqa: E501 101 | 102 | 103 | tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL) 104 | 105 | base_model = LlamaForCausalLM.from_pretrained( 106 | BASE_MODEL, 107 | load_in_8bit=False, 108 | torch_dtype=torch.float16, 109 | device_map={"": "cpu"}, 110 | ) 111 | 112 | 113 | params = PARAM_LIST[MODEL_SIZE] 114 | 115 | n_layers = params["n_layers"] 116 | n_heads = params["n_heads"] 117 | dim = params["dim"] 118 | dims_per_head = dim // n_heads 119 | base = 10000.0 120 | inv_freq = 1.0 / ( 121 | base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) 122 | ) 123 | 124 | if not (LORA_MODEL is None): 125 | lora_model = PeftModel.from_pretrained( 126 | base_model, 127 | LORA_MODEL, 128 | device_map={"": "cpu"}, 129 | torch_dtype=torch.float16, 130 | ) 131 | 132 | 133 | # merge weights 134 | for layer in lora_model.base_model.model.model.layers: 135 | layer.self_attn.q_proj.merge_weights = True 136 | layer.self_attn.v_proj.merge_weights = True 137 | 138 | lora_model.train(False) 139 | 140 | lora_model_sd = lora_model.state_dict() 141 | 142 | 143 | 144 | 145 | 146 | 147 | new_state_dict = {} 148 | for k, v in lora_model_sd.items(): 149 | new_k = translate_state_dict_key(k) 150 | if new_k is not None: 151 | if "wq" in new_k or "wk" in new_k: 152 | new_state_dict[new_k] = unpermute(v) 153 | else: 154 | new_state_dict[new_k] = v 155 | else: 156 | base_model.eval() 157 | new_state_dict = {} 158 | state_dicts = base_model.state_dict() 159 | for k, v in state_dicts.items(): 160 | new_k = translate_state_dict_key(k) 161 | if new_k is not None: 162 | if "wq" in new_k or "wk" in new_k: 163 | new_state_dict[new_k] = unpermute(v) 164 | else: 165 | new_state_dict[new_k] = v 166 | 167 | 168 | 169 | os.makedirs("./ckpt", exist_ok=True) 170 | 171 | torch.save(new_state_dict, "./ckpt/consolidated.00.pth") 172 | 173 | with open("./ckpt/params.json", "w") as f: 174 | json.dump(params, f) 175 | -------------------------------------------------------------------------------- /examples/raft_align.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """Alignment tuning example, such as RLHF.""" 5 | 6 | import logging 7 | import os 8 | import sys 9 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) 10 | from dataclasses import dataclass, field 11 | from typing import Optional 12 | 13 | from transformers import HfArgumentParser, pipeline, AutoTokenizer 14 | 15 | from lmflow.args import ( 16 | ModelArguments, 17 | DatasetArguments, 18 | AutoArguments, 19 | ) 20 | 21 | from lmflow.datasets.dataset import Dataset 22 | from lmflow.models.auto_model import AutoModel 23 | from lmflow.pipeline.auto_pipeline import AutoPipeline 24 | 25 | 26 | @dataclass 27 | class RewardArguments: 28 | reward_type: Optional[str] = field( 29 | default="hf_pipeline", 30 | metadata={ 31 | "help": ( 32 | "type of reward model, support huggingface pipeline. Will" 33 | " support \"customized\" torch.nn.modules in the future." 34 | ), 35 | }, 36 | ) 37 | reward_model_or_path: Optional[str] = field( 38 | default="weqweasdas/hh_rlhf_rm", 39 | metadata={ 40 | "help": ( 41 | "reward model name (huggingface) or its path" 42 | ), 43 | }, 44 | ) 45 | reward_task: Optional[str] = field( 46 | default="sentiment-analysis", 47 | metadata={ 48 | "help": "type of reward task, such as sentiment-analysis, detoxic." 
49 | }, 50 | ) 51 | reward_model_args: Optional[str] = field( 52 | default="return_all_scores=True, function_to_apply=\"none\", batch_size=1", 53 | metadata={ 54 | "help": ( 55 | "extra arguments required by different types of reward models." 56 | ), 57 | }, 58 | ) 59 | 60 | 61 | def get_reward_function(reward_args, pipeline_args): 62 | args = reward_args 63 | reward_type = args.reward_type 64 | 65 | if reward_type == "hf_pipeline": 66 | 67 | # The GPT-2 tokenizer does not define a dedicated pad token, so we reuse its eos token for padding. 68 | # This is only needed for this reward model. 69 | rm_tokenizer = AutoTokenizer.from_pretrained(reward_args.reward_model_or_path) 70 | rm_tokenizer.pad_token = rm_tokenizer.eos_token 71 | rm_tokenizer.pad_token_id = rm_tokenizer.eos_token_id 72 | rm_tokenizer.padding_side = "left" 73 | 74 | hf_pipe = pipeline( 75 | reward_args.reward_task, 76 | model=reward_args.reward_model_or_path, 77 | device=f"cuda:{pipeline_args.local_rank}", 78 | tokenizer=rm_tokenizer 79 | ) 80 | def reward_func(dataset: Dataset): 81 | if dataset.type != "text_only": 82 | raise NotImplementedError( 83 | "reward function only accepts \"text_only\" datasets" 84 | ) 85 | pipe_kwargs = { 86 | "return_all_scores": True, 87 | "function_to_apply": "none", 88 | "batch_size": 1 89 | } 90 | 91 | data_dict = dataset.to_dict() 92 | texts_for_rewards = [ 93 | sample["text"] for sample in data_dict["instances"] 94 | ] 95 | pipe_outputs = hf_pipe(texts_for_rewards, **pipe_kwargs) 96 | rewards = [output[0]["score"] for output in pipe_outputs] 97 | 98 | reward_dataset = Dataset.create_from_dict({ 99 | "type": "float_only", 100 | "instances": [ 101 | { "value": reward } for reward in rewards 102 | ] 103 | }) 104 | return reward_dataset 105 | 106 | return reward_func 107 | else: 108 | raise NotImplementedError(f"unsupported reward type \"{reward_type}\"") 109 | 110 | 111 | def main(): 112 | # Parses arguments 113 | pipeline_name = "raft_aligner" 114 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) 115 | 116 | parser = HfArgumentParser(( 117 | ModelArguments, 118 | DatasetArguments, 119 | PipelineArguments, 120 | RewardArguments, 121 | )) 122 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 123 | model_args, data_args, pipeline_args, reward_args = parser.parse_json_file( 124 | json_file=os.path.abspath(sys.argv[1]) 125 | ) 126 | else: 127 | model_args, data_args, pipeline_args, reward_args = parser.parse_args_into_dataclasses() 128 | 129 | # Initializes pipeline, dataset and model for reward training 130 | aligner = AutoPipeline.get_pipeline( 131 | pipeline_name=pipeline_name, 132 | model_args=model_args, 133 | data_args=data_args, 134 | pipeline_args=pipeline_args, 135 | ) 136 | dataset = Dataset(data_args) 137 | model = AutoModel.get_model(model_args) 138 | 139 | # Initializes reward function 140 | reward_function = get_reward_function(reward_args, pipeline_args) 141 | 142 | reward_model_args = ModelArguments(arch_type="text_regression") 143 | reward_model = AutoModel.get_model(reward_model_args) 144 | reward_model.register_inference_function(reward_function) 145 | 146 | # Aligns model with rewards 147 | aligned_model = aligner.align( 148 | model=model, 149 | dataset=dataset, 150 | reward_model=reward_model, 151 | ) 152 | 153 | 154 | if __name__ == '__main__': 155 | main() -------------------------------------------------------------------------------- /scripts/data_preprocess/concat_shuffle_split.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. 4 | """ 5 | This script is designed for handling large datasets. 6 | It merges multiple datasets located in the same directory, shuffles them, and splits them into training, evaluation, and testing sets. 7 | The training set is further divided into 10 folds. 8 | """ 9 | from __future__ import absolute_import 10 | 11 | import argparse 12 | import json 13 | import textwrap 14 | import sys 15 | import os 16 | import random 17 | import gc 18 | 19 | def parse_argument(sys_argv): 20 | """Parses arguments from command line. 21 | Args: 22 | sys_argv: the list of arguments (strings) from command line. 23 | Returns: 24 | A struct whose member corresponds to the required (optional) variable. 25 | For example, 26 | ``` 27 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) 28 | args.input # 'a.txt' 29 | args.num # 10 30 | ``` 31 | """ 32 | parser = argparse.ArgumentParser( 33 | formatter_class=argparse.RawTextHelpFormatter) 34 | 35 | # Training parameters 36 | parser.add_argument( 37 | "--output_path", type=str, 38 | default=None, 39 | help=textwrap.dedent("output dataset path, writes to stdout by default") 40 | ) 41 | parser.add_argument( 42 | "--merge_from_path", type=str, 43 | nargs="+", 44 | help=textwrap.dedent( 45 | "dataset path of the extra dataset that will be merged" 46 | " into input dataset" 47 | ) 48 | ) 49 | parser.add_argument( 50 | "--seed", type=int, default=42, 51 | help=textwrap.dedent("pseudorandom seed") 52 | ) 53 | parser.add_argument( 54 | "--eval_size", type=int, default=200, 55 | help=textwrap.dedent("size of eval dataset") 56 | ) 57 | parser.add_argument( 58 | "--test_size", type=int, default=1000, 59 | help=textwrap.dedent("size of test dataset") 60 | ) 61 | parser.add_argument( 62 | "--k", type=int, default=10, 63 | help=textwrap.dedent("the train dataset will be divide into k folds") 64 | ) 65 | # Parses from commandline 66 | args = parser.parse_args(sys_argv[1:]) 67 | 68 | return args 69 | 70 | 71 | def main(): 72 | args = parse_argument(sys.argv) 73 | 74 | # concat 75 | if args.merge_from_path is not None: 76 | for i in range(0, len(args.merge_from_path)): 77 | with open(args.merge_from_path[i], "r") as fin: 78 | extra_data_dict = json.load(fin) 79 | if i == 0: 80 | data_dict = extra_data_dict 81 | else: 82 | if data_dict["type"] != extra_data_dict["type"]: 83 | raise ValueError( 84 | 'two dataset have different types:' 85 | f' input dataset: "{data_dict["type"]}";' 86 | f' merge from dataset: "{extra_data_dict["type"]}"' 87 | ) 88 | data_dict["instances"].extend(extra_data_dict["instances"]) 89 | else: 90 | raise ValueError("No merge files specified") 91 | del extra_data_dict 92 | gc.collect() 93 | print('finish concat') 94 | 95 | # shuffle 96 | random.seed(args.seed) 97 | random.shuffle(data_dict["instances"]) 98 | print('finish shuffle') 99 | # split to train, eval, test 100 | train_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][args.eval_size:-args.test_size]} 101 | eval_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][:args.eval_size]} 102 | test_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][-args.test_size:]} 103 | del data_dict 104 | gc.collect() 105 | 106 | # divide train in 10 folds 107 | num_instances = len(train_data_dict["instances"]) 108 | 
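# A quick worked example (a hedged sketch, not part of the original script):
# with the defaults above (--eval_size 200, --test_size 1000, --k 10), a merged
# dataset of 101,200 instances splits as
#     eval  : instances[:200]        ->     200 examples
#     test  : instances[-1000:]      ->   1,000 examples
#     train : instances[200:-1000]   -> 100,000 examples
# and the loop below writes k = 10 train folds of 100,000 // 10 = 10,000
# examples each, with the last fold absorbing any remainder.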
split_size = num_instances // args.k 109 | split_data = [] 110 | for i in range(args.k): 111 | if i < args.k-1: 112 | split = train_data_dict["instances"][i*split_size : (i+1)*split_size] 113 | else: 114 | # Last split may have remaining instances 115 | split = train_data_dict["instances"][i*split_size:] 116 | split_data.append({'type': train_data_dict["type"], 'instances': split}) 117 | 118 | del train_data_dict 119 | gc.collect() 120 | 121 | print('finish split') 122 | # save dataset under output_path 123 | 124 | if args.output_path is None: 125 | args.output_path = sys.stdout 126 | 127 | train_save_path=os.path.join(args.output_path,"train_{k}_folds".format(k=args.k)) 128 | if not os.path.exists(train_save_path): 129 | os.makedirs(train_save_path) 130 | for i in range(args.k): 131 | with open(train_save_path+"/train_"+str(i)+".json", 'w') as f: 132 | json.dump(split_data[i], f, indent=4, ensure_ascii=False) 133 | 134 | eval_save_path=os.path.join(args.output_path,"eval") 135 | if not os.path.exists(eval_save_path): 136 | os.makedirs(eval_save_path) 137 | with open(eval_save_path+'/eval.json','w') as f: 138 | json.dump(eval_data_dict,f,indent=4,ensure_ascii=False) 139 | 140 | test_save_path=os.path.join(args.output_path,"test") 141 | if not os.path.exists(test_save_path): 142 | os.makedirs(test_save_path) 143 | with open(test_save_path+'/test.json','w') as f: 144 | json.dump(test_data_dict,f,indent=4,ensure_ascii=False) 145 | 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | LMFLow. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | --------------------------------------------------------------------------------