├── tests
│   ├── __init__.py
│   ├── datasets
│   │   ├── __init__.py
│   │   └── test_dataset.py
│   ├── models
│   │   ├── __init__.py
│   │   └── test_auto_model.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── test_data_utils.py
│   └── pipeline
│       └── test_auto_pipeline.py
├── src
│   └── lmflow
│       ├── utils
│       │   ├── __init__.py
│       │   ├── flash_attention
│       │   │   ├── __init__.py
│       │   │   ├── gpt_neo_flash_attention.py
│       │   │   ├── bloom_flash_attention.py
│       │   │   └── llama_flash_attention.py
│       │   ├── position_interpolation
│       │   │   ├── __init__.py
│       │   │   └── llama_rope_scaled_monkey_patch.py
│       │   └── constants.py
│       ├── models
│       │   ├── __init__.py
│       │   ├── interfaces
│       │   │   ├── __init__.py
│       │   │   └── tunable.py
│       │   ├── base_model.py
│       │   ├── regression_model.py
│       │   ├── decoder_model.py
│       │   ├── encoder_decoder_model.py
│       │   ├── auto_model.py
│       │   └── text_regression_model.py
│       ├── pipeline
│       │   ├── __init__.py
│       │   ├── utils
│       │   │   ├── __init__.py
│       │   │   └── peft_trainer.py
│       │   ├── base_pipeline.py
│       │   ├── base_tuner.py
│       │   ├── base_aligner.py
│       │   └── auto_pipeline.py
│       ├── version.py
│       ├── datasets
│       │   └── __init__.py
│       └── __init__.py
├── scripts
│   ├── bash.sh
│   ├── run_unittest.sh
│   ├── run_app.sh
│   ├── vocab_extension
│   │   ├── convert_json_to_txt.sh
│   │   ├── merge_tokenizer.sh
│   │   ├── train_tokenizer.sh
│   │   └── train_merge_tokenizer.sh
│   ├── run_evaluation.sh
│   ├── run_vis_chatbot_blip2.sh
│   ├── run_chatbot.sh
│   ├── run_chatbot_cpu.sh
│   ├── run_chatbot_chatglm.sh
│   ├── run_inference_multimodal_model.sh
│   ├── run_evaluation_accelerator.sh
│   ├── run_evaluation_with_lora.sh
│   ├── run_benchmark.sh
│   ├── run_vis_chatbot_minigpt4.sh
│   ├── .nfs0000000094418362000004c4
│   ├── run_finetune.sh
│   ├── run_finetune_with_lora.sh
│   ├── run_multistage_finetune.sh
│   ├── run_reward_modeling.sh
│   ├── run_finetune_with_lora_save_aggregated_weights.sh
│   ├── run_raft_align.sh
│   ├── data_preprocess
│   │   ├── run_data_preprocess.sh
│   │   ├── count.py
│   │   ├── shuffle.py
│   │   ├── sample.py
│   │   ├── add_prompt.py
│   │   ├── concat.py
│   │   ├── add_end_mark.py
│   │   ├── merge.py
│   │   └── concat_shuffle_split.py
│   ├── run_all_benchmark.sh
│   ├── run_vis_chatbot_gradio_minigpt4.sh
│   └── export_llama_state_dict_checkpoint.py
├── assets
│   ├── logo.png
│   ├── features.png
│   ├── robin13b.png
│   ├── robin33b.png
│   ├── robin65b.png
│   ├── robin7b.jpg
│   ├── robin7b_.png
│   ├── Cockatoo3b.png
│   ├── Cockatoo7b.png
│   ├── Parakeets.png
│   ├── robin13b_.jpg
│   ├── robin33b_.png
│   ├── robin65b_.png
│   ├── colab-shell-chatbot-demo.png
│   └── multimodal-chatbot-demo.gif
├── docs
│   ├── source
│   │   ├── _static
│   │   │   ├── eq.png
│   │   │   ├── logo.png
│   │   │   ├── nll.png
│   │   │   ├── ppl.png
│   │   │   ├── raft.png
│   │   │   ├── IT_sample1.png
│   │   │   ├── IT_sample2.png
│   │   │   ├── IT_sample3.png
│   │   │   ├── IT_sample4.png
│   │   │   ├── IT_sample5.png
│   │   │   ├── IT_sample6.png
│   │   │   ├── IT_sample7.png
│   │   │   ├── raft_idea.PNG
│   │   │   ├── benchmark-1.png
│   │   │   ├── benchmark-2.png
│   │   │   ├── raft_reward.PNG
│   │   │   ├── raft-demo-examples.png
│   │   │   ├── logo5.svg
│   │   │   ├── logo4.svg
│   │   │   ├── logo6.svg
│   │   │   ├── logo.svg
│   │   │   ├── logo2.svg
│   │   │   └── logo3.svg
│   │   ├── about
│   │   │   ├── authors.md
│   │   │   ├── index.md
│   │   │   └── changelog.md
│   │   ├── blogs
│   │   │   └── index.md
│   │   ├── api
│   │   │   └── _autosummary
│   │   │       └── lmflow.args.rst
│   │   ├── examples
│   │   │   ├── index.md
│   │   │   ├── checkpoints.md
│   │   │   ├── medical_finetune.md
│   │   │   ├── DATASETS.md
│   │   │   └── TASK_GUIDE.md
│   │   └── conf.py
│   └── requirements.txt
├── service
│   ├── static
│   │   └── assets
│   │       ├── logo.png
│   │       └── background.png
│   └── app.py
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── blank-template.md
│   │   ├── api-feedback.md
│   │   ├── feature-request.md
│   │   └── bug-report.md
│   └── workflows
│       └── documentation.yaml
├── .gitattributes
├── examples
│   ├── ds_config.json
│   ├── merge_lora.py
│   ├── evaluation.py
│   ├── finetune.py
│   ├── chatbot.py
│   └── raft_align.py
├── configs
│   ├── ds_config_eval.json
│   ├── ds_config_chatbot.json
│   ├── ds_config_multimodal.json
│   ├── accelerator_singlegpu_config.yaml
│   ├── accelerator_multigpu_config.yaml
│   ├── ds_config_zero3_for_eval.json
│   ├── ds_config_zero2.json
│   └── ds_config_zero3.json
├── docker
│   ├── Dockerfile
│   └── README.md
├── requirements.txt
├── CONTRIBUTING.md
├── utils
│   ├── train_tokenizer.py
│   ├── convert_minigpt4_checkpoints.py
│   ├── convert_json_to_txt.py
│   ├── make_delta.py
│   ├── merge_tokenizer.py
│   └── lm_evaluator.py
├── setup.py
├── experimental
│   └── RAFT-diffusion
│       ├── requirements.txt
│       └── README.md
├── .gitignore
├── output_models
│   └── download.sh
└── CODE_OF_CONDUCT.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/lmflow/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/pipeline/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/models/interfaces/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
--------------------------------------------------------------------------------
/src/lmflow/utils/flash_attention/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/utils/position_interpolation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scripts/bash.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Shell and Python scripts go here
--------------------------------------------------------------------------------
/scripts/run_unittest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m unittest discover
4 |
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/logo.png
--------------------------------------------------------------------------------
/assets/features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/features.png
--------------------------------------------------------------------------------
/assets/robin13b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin13b.png
--------------------------------------------------------------------------------
/assets/robin33b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin33b.png
--------------------------------------------------------------------------------
/assets/robin65b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin65b.png
--------------------------------------------------------------------------------
/assets/robin7b.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin7b.jpg
--------------------------------------------------------------------------------
/assets/robin7b_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin7b_.png
--------------------------------------------------------------------------------
/assets/Cockatoo3b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Cockatoo3b.png
--------------------------------------------------------------------------------
/assets/Cockatoo7b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Cockatoo7b.png
--------------------------------------------------------------------------------
/assets/Parakeets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/Parakeets.png
--------------------------------------------------------------------------------
/assets/robin13b_.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin13b_.jpg
--------------------------------------------------------------------------------
/assets/robin33b_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin33b_.png
--------------------------------------------------------------------------------
/assets/robin65b_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/robin65b_.png
--------------------------------------------------------------------------------
/docs/source/_static/eq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/eq.png
--------------------------------------------------------------------------------
/docs/source/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/logo.png
--------------------------------------------------------------------------------
/docs/source/_static/nll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/nll.png
--------------------------------------------------------------------------------
/docs/source/_static/ppl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/ppl.png
--------------------------------------------------------------------------------
/docs/source/_static/raft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft.png
--------------------------------------------------------------------------------
/service/static/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/service/static/assets/logo.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample1.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample2.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample3.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample4.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample5.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample6.png
--------------------------------------------------------------------------------
/docs/source/_static/IT_sample7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/IT_sample7.png
--------------------------------------------------------------------------------
/docs/source/_static/raft_idea.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft_idea.PNG
--------------------------------------------------------------------------------
/assets/colab-shell-chatbot-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/assets/colab-shell-chatbot-demo.png
--------------------------------------------------------------------------------
/docs/source/_static/benchmark-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/benchmark-1.png
--------------------------------------------------------------------------------
/docs/source/_static/benchmark-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/benchmark-2.png
--------------------------------------------------------------------------------
/docs/source/_static/raft_reward.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft_reward.PNG
--------------------------------------------------------------------------------
/service/static/assets/background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/service/static/assets/background.png
--------------------------------------------------------------------------------
/docs/source/_static/raft-demo-examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpaviot/LMFlow/main/docs/source/_static/raft-demo-examples.png
--------------------------------------------------------------------------------
/docs/source/about/authors.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 |
4 | Shizhe Diao, Rui Pan, Hanze Dong, Ka Shun Shum, Jipeng Zhang, Wei Xiong, Tong Zhang
5 |
--------------------------------------------------------------------------------
/docs/source/blogs/index.md:
--------------------------------------------------------------------------------
1 | # Blogs
2 |
3 | ## 2023
4 |
5 |
6 | ```{toctree}
7 | :maxdepth: 1
8 |
9 | benchmark
10 | ```
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/blank-template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Blank Template
3 | about: Other issues
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/assets/multimodal-chatbot-demo.gif:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:206296519e7892d65cacc48c7e98c6743301b74c29401d57e325197bd6e41cac
3 | size 79864304
4 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==5.3.0
2 | pydata-sphinx-theme==0.13.1
3 | sphinx_design==0.3.0
4 | myst-parser==1.0.0
5 | sphinx-autoapi==2.0.0
6 | matplotlib==3.4.1
7 | numpydoc==0.9.1
--------------------------------------------------------------------------------
/docs/source/about/index.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 |
4 | ```{toctree}
5 | :maxdepth: 2
6 |
7 | changelog
8 | ```
9 |
10 |
11 | ```{toctree}
12 | :maxdepth: 2
13 |
14 | authors
15 | ```
16 |
--------------------------------------------------------------------------------
/src/lmflow/models/interfaces/tunable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """Tunable class
4 | """
5 |
6 | from abc import ABC
7 |
8 |
9 | class Tunable(ABC):
10 | pass
11 |
--------------------------------------------------------------------------------
/src/lmflow/pipeline/base_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """ BasePipeline.
4 | """
5 |
6 | from abc import ABC # abstract class
7 |
8 | class BasePipeline(ABC):
9 | pass
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/api-feedback.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: API Feedback
3 | about: Provide feedback regarding the current design of the API.
4 | title: "[API Design]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/scripts/run_app.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml service/app.py \
4 | --model_name_or_path gpt2 \
5 | --torch_dtype bfloat16 \
6 | --max_new_tokens 200
--------------------------------------------------------------------------------
/src/lmflow/models/base_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """Base model class.
4 | """
5 |
6 | from abc import ABC
7 |
8 |
9 | class BaseModel(ABC):
10 |
11 | def __init__(self, *args, **kwargs):
12 | pass
13 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.html linguist-detectable=false
2 | *.js linguist-detectable=false
3 | *.ipynb linguist-detectable=false
4 | *RAFT.pdf filter=lfs diff=lfs merge=lfs -text
5 | *.gif filter=lfs diff=lfs merge=lfs -text
6 | assets/*.gif filter=lfs diff=lfs merge=lfs -text
7 |
--------------------------------------------------------------------------------
/examples/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": false
4 | },
5 | "bf16": {
6 | "enabled": true
7 | },
8 | "steps_per_print": 2000,
9 | "train_micro_batch_size_per_gpu": 1,
10 | "wall_clock_breakdown": false
11 | }
12 |
--------------------------------------------------------------------------------
/configs/ds_config_eval.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": false
4 | },
5 | "bf16": {
6 | "enabled": false
7 | },
8 | "steps_per_print": 2000,
9 | "train_micro_batch_size_per_gpu": 1,
10 | "wall_clock_breakdown": false
11 | }
12 |
--------------------------------------------------------------------------------
/scripts/vocab_extension/convert_json_to_txt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd data && bash download.sh wiki_zh_eval && cd -
4 |
5 | python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \
6 | --output_path ./data/wiki_zh_eval/converted_data.txt \
7 | --overwrite True
--------------------------------------------------------------------------------
/src/lmflow/models/regression_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """General regression model."""
4 |
5 | from lmflow.models.base_model import BaseModel
6 |
7 |
8 | class RegressionModel(BaseModel):
9 |
10 | def __init__(self, *args, **kwargs):
11 | pass
12 |
--------------------------------------------------------------------------------
/scripts/vocab_extension/merge_tokenizer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p ./output_models/merged_tokenizer
3 | python utils/merge_tokenizer.py --tokenizer_dir pinkmanlove/llama-7b-hf \
4 | --chinese_sp_model_file ./output_models/new_tokenizer/example.model \
5 | --output_dir ./output_models/merged_tokenizer
--------------------------------------------------------------------------------
/docs/source/about/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 |
4 | ## Version 0.0.1 (Mar 28, 2023)
5 |
6 | The first public version.
7 |
8 | Task tuning and instruction tuning on user-defined datasets.
9 |
10 | A simple and extensible API for developers.
11 |
12 | Efficient finetuning with LoRA.
13 |
14 | Simplified model inference framework.
15 |
16 |
--------------------------------------------------------------------------------
/scripts/vocab_extension/train_tokenizer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p ./output_models/new_tokenizer
3 | python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \
4 | --model_type bpe \
5 | --output_dir ./output_models/new_tokenizer \
6 | --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
7 | --vocab_size 20000
--------------------------------------------------------------------------------
/scripts/run_evaluation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0 \
4 | deepspeed examples/evaluation.py \
5 | --answer_type medmcqa \
6 | --model_name_or_path gpt2-large \
7 | --dataset_path data/MedQA-USMLE/validation \
8 | --deepspeed examples/ds_config.json \
9 | --inference_batch_size_per_device 1 \
10 | --metric accuracy
11 |
--------------------------------------------------------------------------------
/scripts/run_vis_chatbot_blip2.sh:
--------------------------------------------------------------------------------
1 | model=Salesforce/blip2-opt-2.7b
2 | deepspeed examples/vis_chatbot.py --model_name_or_path ${model} \
3 | --deepspeed configs/ds_config_multimodal.json \
4 | --arch_type vision_encoder_decoder \
5 | --task vqa \
6 | ${@:1}
7 |
--------------------------------------------------------------------------------
/scripts/run_chatbot.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model=gpt2
4 | lora_args=""
5 | if [ $# -ge 1 ]; then
6 | model=$1
7 | fi
8 | if [ $# -ge 2 ]; then
9 | lora_args="--lora_model_path $2"
10 | fi
11 |
12 | CUDA_VISIBLE_DEVICES=0 \
13 | deepspeed examples/chatbot.py \
14 | --deepspeed configs/ds_config_chatbot.json \
15 | --model_name_or_path ${model} \
16 | ${lora_args}
17 |
--------------------------------------------------------------------------------
/scripts/run_chatbot_cpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model=gpt2
4 | lora_args=""
5 | if [ $# -ge 1 ]; then
6 | model=$1
7 | fi
8 | if [ $# -ge 2 ]; then
9 | lora_args="--lora_model_path $2"
10 | fi
11 |
12 | CUDA_VISIBLE_DEVICES="" \
13 | python examples/chatbot.py \
14 | --deepspeed configs/ds_config_chatbot.json \
15 | --model_name_or_path ${model} \
16 | --device "cpu" \
17 | ${lora_args}
18 |
--------------------------------------------------------------------------------
/src/lmflow/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """This Python code defines a class Dataset with methods for initializing, loading,
2 | and manipulating datasets from different backends such as Hugging Face and JSON.
3 |
4 | The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging
5 | Face dataset, mapping datasets, and retrieving the backend dataset and arguments.
6 | """
7 | from lmflow.datasets.dataset import Dataset
8 |
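9 | # Minimal usage sketch (illustrative only; exact argument fields and method
10 | # names should be checked against lmflow.args and lmflow.datasets.dataset):
11 | #
12 | #     from lmflow.args import DatasetArguments
13 | #     from lmflow.datasets import Dataset
14 | #
15 | #     data_args = DatasetArguments(dataset_path="data/alpaca/train")
16 | #     dataset = Dataset(data_args)
17 | #     backend_dataset = dataset.get_backend_dataset()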
--------------------------------------------------------------------------------
/configs/ds_config_chatbot.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": false
4 | },
5 | "bf16": {
6 | "enabled": true
7 | },
8 | "comms_logger": {
9 | "enabled": false,
10 | "verbose": false,
11 | "prof_all": false,
12 | "debug": false
13 | },
14 | "steps_per_print": 20000000000000000,
15 | "train_micro_batch_size_per_gpu": 1,
16 | "wall_clock_breakdown": false
17 | }
18 |
--------------------------------------------------------------------------------
/configs/ds_config_multimodal.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": false
4 | },
5 | "bf16": {
6 | "enabled": false
7 | },
8 | "comms_logger": {
9 | "enabled": false,
10 | "verbose": false,
11 | "prof_all": false,
12 | "debug": false
13 | },
14 | "steps_per_print": 20000000000000000,
15 | "train_micro_batch_size_per_gpu": 1,
16 | "wall_clock_breakdown": false
17 | }
18 |
--------------------------------------------------------------------------------
/docs/source/api/_autosummary/lmflow.args.rst:
--------------------------------------------------------------------------------
1 | lmflow.args
2 | ===========
3 |
4 | .. automodule:: lmflow.args
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | .. rubric:: Classes
17 |
18 | .. autosummary::
19 |
20 | DatasetArguments
21 | FinetunerArguments
22 | InferencerArguments
23 | ModelArguments
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/scripts/run_chatbot_chatglm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model=THUDM/chatglm-6b
4 | lora_args=""
5 | if [ $# -ge 1 ]; then
6 | model=$1
7 | fi
8 | if [ $# -ge 2 ]; then
9 | lora_args="--lora_model_path $2"
10 | fi
11 |
12 | CUDA_VISIBLE_DEVICES=0 \
13 | deepspeed examples/chatbot.py \
14 | --arch_type encoder_decoder \
15 | --deepspeed configs/ds_config_chatbot.json \
16 | --model_name_or_path ${model} \
17 | ${lora_args}
--------------------------------------------------------------------------------
/configs/accelerator_singlegpu_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | distributed_type: 'NO'
3 | downcast_bf16: 'no'
4 | dynamo_config:
5 | dynamo_backend: INDUCTOR
6 | gpu_ids:
7 | machine_rank: 0
8 | main_training_function: main
9 | mixed_precision: bf16
10 | num_machines: 1
11 | num_processes: 1
12 | rdzv_backend: static
13 | same_network: true
14 | tpu_env: []
15 | tpu_use_cluster: false
16 | tpu_use_sudo: false
17 | use_cpu: false
18 |
--------------------------------------------------------------------------------
/configs/accelerator_multigpu_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | distributed_type: MULTI_GPU
3 | downcast_bf16: 'no'
4 | dynamo_config:
5 | dynamo_backend: INDUCTOR
6 | gpu_ids:
7 | machine_rank: 0
8 | main_training_function: main
9 | mixed_precision: bf16
10 | num_machines: 1
11 | num_processes: 2
12 | rdzv_backend: static
13 | same_network: true
14 | tpu_env: []
15 | tpu_use_cluster: false
16 | tpu_use_sudo: false
17 | use_cpu: false
18 | main_process_port: 11000
19 |
--------------------------------------------------------------------------------
/scripts/run_inference_multimodal_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model="Salesforce/blip-image-captioning-base"
4 | lora_args=""
5 | if [ $# -ge 1 ]; then
6 | model=$1
7 | fi
8 | if [ $# -ge 2 ]; then
9 | lora_args="--lora_model_path $2"
10 | fi
11 |
12 | CUDA_VISIBLE_DEVICES=0 \
13 | deepspeed examples/inference.py \
14 | --deepspeed configs/ds_config_multimodal.json \
15 | --model_name_or_path ${model} \
16 | --arch_type vision_encoder_decoder \
17 | ${lora_args}
18 |
--------------------------------------------------------------------------------
/src/lmflow/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import __version__ as internal_version
2 |
3 | __version__ = internal_version
4 |
5 | from transformers.utils import check_min_version
6 | from transformers.utils.versions import require_version
7 |
8 | from lmflow import args, datasets, models, pipeline, utils
9 |
10 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
11 | check_min_version("4.27.0.dev0")
12 |
13 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
--------------------------------------------------------------------------------
/scripts/run_evaluation_accelerator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml examples/evaluation.py \
4 | --answer_type usmle \
5 | --model_name_or_path gpt2-large \
6 | --dataset_path data/MedQA-USMLE/validation \
7 | --use_ram_optimized_load True \
8 | --deepspeed examples/ds_config.json \
9 | --metric accuracy \
10 | --output_dir output_dir/accelerator_1_card \
11 | --inference_batch_size_per_device 1 \
12 | --use_accelerator_for_evaluator True \
13 | --torch_dtype bfloat16
14 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04
2 |
3 | ENV TZ=Etc/UTC
4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
5 |
6 | RUN apt-get update --fix-missing && apt-get install -y fontconfig --fix-missing
7 | RUN apt-get install -y libopenmpi-dev
8 | RUN apt-get install -y git python3.9 python3.9-dev python3.9-venv
9 | RUN python3.9 -m venv /venv
10 | ENV PATH=/venv/bin:$PATH
11 | RUN pip install mpi4py
12 |
13 | ARG SRCDIR
14 |
15 | RUN mkdir /LMFlow/
16 | WORKDIR /LMFlow/
17 |
18 | COPY $SRCDIR/ /LMFlow/
19 |
20 | RUN pip install wheel
21 | RUN pip install -e .
22 |
--------------------------------------------------------------------------------
/src/lmflow/pipeline/base_tuner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """ BaseTuner: a subclass of BasePipeline.
4 | """
5 |
6 | from lmflow.pipeline.base_pipeline import BasePipeline
7 |
8 |
9 | class BaseTuner(BasePipeline):
10 | """ A subclass of BasePipeline which is tunable.
11 | """
12 | def __init__(self, *args, **kwargs):
13 | pass
14 |
15 | def _check_if_tunable(self, model, dataset):
16 | # TODO: check if the model is tunable and dataset is compatible
17 | pass
18 |
19 | def tune(self, model, dataset):
20 | raise NotImplementedError(".tune is not implemented")
21 |
--------------------------------------------------------------------------------
/src/lmflow/models/decoder_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """A one-line summary of the module or program, terminated by a period.
4 |
5 | Leave one blank line. The rest of this docstring should contain an
6 | overall description of the module or program. Optionally, it may also
7 | contain a brief description of exported classes and functions and/or usage
8 | examples.
9 |
10 | Typical usage example:
11 |
12 | foo = ClassFoo()
13 | bar = foo.FunctionBar()
14 | """
15 |
16 | from lmflow.models.base_model import BaseModel
17 |
18 |
19 | class DecoderModel(BaseModel):
20 |
21 | def __init__(self, *args, **kwargs):
22 | pass
23 |
--------------------------------------------------------------------------------
/src/lmflow/models/encoder_decoder_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """A one-line summary of the module or program, terminated by a period.
4 |
5 | Leave one blank line. The rest of this docstring should contain an
6 | overall description of the module or program. Optionally, it may also
7 | contain a brief description of exported classes and functions and/or usage
8 | examples.
9 |
10 | Typical usage example:
11 |
12 | foo = ClassFoo()
13 | bar = foo.FunctionBar()
14 | """
15 |
16 | from lmflow.models.base_model import BaseModel
17 |
18 |
19 | class EncoderDecoderModel(BaseModel):
20 |
21 | def __init__(self, *args, **kwargs):
22 | pass
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.24.2
2 | datasets==2.10.1
3 | peft @ git+https://github.com/huggingface/peft.git@deff03f2c251534fffd2511fc2d440e84cc54b1b
4 | torch==2.0.0
5 | wandb==0.14.0
6 | deepspeed==0.8.3
7 | trl @ git+https://github.com/lvwerra/trl.git#egg=trl-0.4.1
8 | sentencepiece
9 | transformers @ git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda
10 | flask
11 | flask_cors
12 | icetk
13 | cpm_kernels==1.0.11
14 | evaluate==0.4.0
15 | scikit-learn==1.2.2
16 | lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@e47e01beea79cfe87421e2dac49e64d499c240b4
17 | dill<0.3.5
18 | bitsandbytes==0.38.1
19 | pydantic<=1.10.9
20 | gradio
21 |
--------------------------------------------------------------------------------
/scripts/run_evaluation_with_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # --model_name_or_path specifies the original huggingface model
4 | # --lora_model_path specifies the model difference introduced by finetuning,
5 | # i.e. the one saved by ./scripts/run_finetune_with_lora.sh
6 | CUDA_VISIBLE_DEVICES=0 \
7 | deepspeed examples/evaluation.py \
8 | --answer_type text \
9 | --model_name_or_path facebook/galactica-1.3b \
10 | --lora_model_path output_models/finetune_with_lora \
11 | --dataset_path data/alpaca/test \
12 | --prompt_structure "Input: {input}" \
13 | --deepspeed examples/ds_config.json \
14 | --inference_batch_size_per_device 1 \
15 | --metric accuracy
16 |
--------------------------------------------------------------------------------
/scripts/run_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$1" == "-h" -o "$1" == "--help" ]; then
4 | help_message="./$(basename $0)"
5 | help_message+=" --dataset_name DATASET_NAME"
6 | help_message+=" --model_name_or_path MODEL_NAME_OR_PATH"
7 | echo ${help_message} 1>&2
8 | exit 1
9 | fi
10 |
11 | extra_args="--dataset_name gpt4_en_eval --model_name_or_path gpt2"
12 | if [ $# -ge 1 ]; then
13 | extra_args="$@"
14 | fi
15 |
16 |
17 | CUDA_VISIBLE_DEVICES=0 \
18 | deepspeed --master_port 11001 examples/benchmarking.py \
19 | --use_ram_optimized_load 0 \
20 | --deepspeed examples/ds_config.json \
21 | --metric nll \
22 | --prompt_structure "###Human: {input}###Assistant:" \
23 | ${extra_args}
--------------------------------------------------------------------------------
/src/lmflow/pipeline/base_aligner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """ BaseTuner: a subclass of BasePipeline.
4 | """
5 |
6 | from lmflow.pipeline.base_pipeline import BasePipeline
7 |
8 |
9 | class BaseAligner(BasePipeline):
10 | """ A subclass of BasePipeline which is alignable.
11 | """
12 | def __init__(self, *args, **kwargs):
13 | pass
14 |
15 | def _check_if_alignable(self, model, dataset, reward_model):
16 | # TODO: check if the model is alignable and dataset is compatible
17 | # TODO: add reward_model
18 | pass
19 |
20 | def align(self, model, dataset, reward_model):
21 | raise NotImplementedError(".align is not implemented")
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: Suggest an idea for this project
4 | title: "[New Feature]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/documentation.yaml:
--------------------------------------------------------------------------------
1 | name: Docs
2 | on: [push, pull_request, workflow_dispatch]
3 | jobs:
4 | docs:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v3
8 | - uses: actions/setup-python@v3
9 | - name: Install current pkg
10 | run: |
11 | pip install -e .
12 | - name: Install dependencies
13 | run: |
14 | pip install -r ./docs/requirements.txt
15 | - name: Sphinx build
16 | run: |
17 | sphinx-build docs/source _build
18 | - name: Deploy
19 | uses: peaceiris/actions-gh-pages@v3
20 | with:
21 | publish_branch: gh-pages
22 | github_token: ${{ secrets.GITHUB_TOKEN }}
23 | publish_dir: _build/
24 | force_orphan: true
25 |
--------------------------------------------------------------------------------
/docs/source/examples/index.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | We provide several examples to show how to use our package for your own problems.
4 |
5 | ## Data preparation
6 |
7 | ```{toctree}
8 | :maxdepth: 3
9 |
10 | DATASETS
11 | ```
12 |
13 | ```{toctree}
14 | :maxdepth: 3
15 |
16 | checkpoints
17 | ```
18 |
19 | ## Finetuning
20 |
21 | For SFT, refer to [examples](https://github.com/OptimalScale/LMFlow/blob/main/examples).
22 |
23 |
24 | For the alignment process, see:
25 |
26 | ```{toctree}
27 | :maxdepth: 3
28 |
29 | reward_modeling
30 | ```
31 |
32 |
33 | ```{toctree}
34 | :maxdepth: 3
35 |
36 | raft
37 | ```
38 |
39 | ## Inference
40 |
41 | Refer to [examples](https://github.com/OptimalScale/LMFlow/blob/main/examples).
42 |
43 | ## Evaluation
44 |
45 | ```{toctree}
46 | :maxdepth: 3
47 |
48 | TASK_GUIDE
49 | ```
50 |
51 |
52 |
--------------------------------------------------------------------------------
/scripts/run_vis_chatbot_minigpt4.sh:
--------------------------------------------------------------------------------
1 | model=Salesforce/blip2-flan-t5-xxl
2 | checkpoint_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth
3 | llm_model_name_or_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/vicuna-7b/
4 | deepspeed examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_multimodal.json --arch_type vision_encoder_decoder --task vqa --custom_model \
5 | --prompt_format mini_gpt \
6 | --prompt_structure "{input_text}###Assistant:" \
7 | --checkpoint_path ${checkpoint_path} \
8 | --llm_model_name_or_path ${llm_model_name_or_path} \
9 | --low_resource True \
10 | ${@:1}
11 |
12 |
--------------------------------------------------------------------------------
/configs/ds_config_zero3_for_eval.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": true
4 | },
5 | "zero_optimization": {
6 | "stage": 3,
7 | "offload_optimizer": {
8 | "device": "cpu",
9 | "pin_memory": true
10 | },
11 | "offload_param": {
12 | "device": "cpu",
13 | "pin_memory": true
14 | },
15 | "overlap_comm": true,
16 | "contiguous_gradients": true,
17 | "sub_group_size": 1e9,
18 | "reduce_bucket_size": "auto",
19 | "stage3_prefetch_bucket_size": "auto",
20 | "stage3_param_persistence_threshold": "auto",
21 | "stage3_max_live_parameters": 1e9,
22 | "stage3_max_reuse_distance": 1e9,
23 | "stage3_gather_16bit_weights_on_model_save": true
24 | },
25 |
26 | "steps_per_print": 2000,
27 | "train_micro_batch_size_per_gpu": 1,
28 | "wall_clock_breakdown": false
29 | }
30 |
--------------------------------------------------------------------------------
/scripts/.nfs0000000094418362000004c4:
--------------------------------------------------------------------------------
1 | model=Salesforce/blip2-flan-t5-xxl
2 | checkpoint_path=$1
3 | llm_model_name_or_path=$2
4 | deepspeed examples/vis_chatbot_gradio.py --model_name_or_path ${model} \
5 | --deepspeed configs/ds_config_multimodal.json \
6 | --arch_type vision_encoder_decoder \
7 | --task vqa \
8 | --custom_model \
9 | --prompt_format mini_gpt \
10 | --prompt_structure "{input_text}###Assistant:" \
11 | --checkpoint_path ${checkpoint_path} \
12 | --llm_model_name_or_path ${llm_model_name_or_path} \
13 | --low_resource True \
14 | ${@:3}
15 |
--------------------------------------------------------------------------------
/scripts/vocab_extension/train_merge_tokenizer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # download data
4 | cd data && bash download.sh wiki_zh_eval && cd -
5 |
6 | # convert json to txt for sentencepiece
7 | python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \
8 | --output_path ./data/wiki_zh_eval/converted_data.txt \
9 | --overwrite True
10 |
11 | # train a new tokenizer
12 | mkdir -p ./output_models/new_tokenizer
13 | python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \
14 | --model_type bpe \
15 | --output_dir ./output_models/new_tokenizer \
16 | --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
17 | --vocab_size 20000
18 |
19 | # merge the new tokenizer with the old one
20 | mkdir -p ./output_models/merged_tokenizer
21 | python utils/merge_tokenizer.py --chinese_sp_model_file ./output_models/new_tokenizer/example.model \
22 | --tokenizer_dir pinkmanlove/llama-7b-hf \
23 | --output_dir ./output_models/merged_tokenizer
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/src/lmflow/models/auto_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """Automatically get correct model type.
4 | """
5 |
6 | from lmflow.models.hf_decoder_model import HFDecoderModel
7 | from lmflow.models.text_regression_model import TextRegressionModel
8 | from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel
9 |
10 | class AutoModel:
11 |
12 | @classmethod
13 | def get_model(cls, model_args, *args, **kwargs):
14 | arch_type = model_args.arch_type
15 | if arch_type == "decoder_only":
16 | return HFDecoderModel(model_args, *args, **kwargs)
17 | elif arch_type == "text_regression":
18 | return TextRegressionModel(model_args, *args, **kwargs)
19 | elif arch_type == "encoder_decoder" or \
20 | arch_type == "vision_encoder_decoder":
21 | return HFEncoderDecoderModel(model_args, *args, **kwargs)
22 | else:
23 | raise NotImplementedError(
24 | f"model architecture type \"{arch_type}\" is not supported"
25 | )
26 |
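27 | # Example (illustrative): with parsed ModelArguments where
28 | # arch_type == "decoder_only", AutoModel.get_model(model_args) returns an
29 | # HFDecoderModel wrapping the underlying Hugging Face model.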
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # LMFlow
2 |
3 | We welcome contributions from the open-source community with open arms! We value and appreciate all types of participation, not just code. Whether you're answering questions, offering help, improving the documentation, or simply reaching out, your contributions are immensely valuable to us. So, if you're interested, don't hesitate to get involved!
4 |
5 | To start, we encourage everyone to say hello in our public Discord channel. Here, we discuss the latest trends in Large Foundation models, showcase personal projects, help each other with contributions, or just hang out over a cup of coffee. Join us on Discord!
6 |
7 | No matter how you choose to contribute, we strive to maintain an open, welcoming, and kind community. We ask that you read our code of conduct and be respectful during your interactions. It's also essential that you become familiar with the ethical guidelines that guide our project and adhere to the same principles of transparency and responsibility.
8 |
9 | We highly value feedback from the community, so please don't hesitate to speak up if you have any valuable feedback that can help improve the library. We read and consider every message, comment, issue, and pull request (PR).
10 |
--------------------------------------------------------------------------------
/configs/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "bf16": {
12 | "enabled": "auto"
13 | },
14 |
15 | "optimizer": {
16 | "type": "AdamW",
17 | "params": {
18 | "lr": "auto",
19 | "betas": "auto",
20 | "eps": "auto",
21 | "weight_decay": "auto"
22 | }
23 | },
24 |
25 | "zero_optimization": {
26 | "stage": 2,
27 | "offload_optimizer": {
28 | "device": "cpu",
29 | "pin_memory": true
30 | },
31 | "allgather_partitions": true,
32 | "allgather_bucket_size": 2e8,
33 | "overlap_comm": true,
34 | "reduce_scatter": true,
35 | "reduce_bucket_size": 2e8,
36 | "contiguous_gradients": true
37 | },
38 |
39 | "gradient_accumulation_steps": "auto",
40 | "gradient_clipping": "auto",
41 | "steps_per_print": 2000,
42 | "train_batch_size": "auto",
43 | "train_micro_batch_size_per_gpu": "auto",
44 | "wall_clock_breakdown": false
45 | }
46 |
--------------------------------------------------------------------------------
/scripts/run_finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under ${project_id} in project directory of
3 | # https://github.com/shizhediao/llm-ft
4 | # COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4
5 |
6 | deepspeed_args="--master_port=11000" # Default argument
7 | if [ $# -ge 1 ]; then
8 | deepspeed_args="$1"
9 | fi
10 |
11 | exp_id=finetune
12 | project_dir=$(cd "$(dirname $0)"/..; pwd)
13 | output_dir=${project_dir}/output_models/${exp_id}
14 | log_dir=${project_dir}/log/${exp_id}
15 |
16 | dataset_path=${project_dir}/data/alpaca/train
17 |
18 | mkdir -p ${output_dir} ${log_dir}
19 |
20 | deepspeed ${deepspeed_args} \
21 | examples/finetune.py \
22 | --model_name_or_path gpt2 \
23 | --dataset_path ${dataset_path} \
24 | --output_dir ${output_dir} --overwrite_output_dir \
25 | --num_train_epochs 0.01 \
26 | --learning_rate 2e-5 \
27 | --block_size 512 \
28 | --per_device_train_batch_size 1 \
29 | --deepspeed configs/ds_config_zero3.json \
30 | --bf16 \
31 | --run_name finetune \
32 | --validation_split_percentage 0 \
33 | --logging_steps 20 \
34 | --do_train \
35 | --ddp_timeout 72000 \
36 | --save_steps 5000 \
37 | --dataloader_num_workers 1 \
38 | | tee ${log_dir}/train.log \
39 | 2> ${log_dir}/train.err
40 |
--------------------------------------------------------------------------------
/src/lmflow/pipeline/auto_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """Return a pipeline automatically based on its name.
4 | """
5 |
6 | from lmflow.pipeline.evaluator import Evaluator
7 | from lmflow.pipeline.finetuner import Finetuner
8 | from lmflow.pipeline.inferencer import Inferencer
9 | from lmflow.pipeline.raft_aligner import RaftAligner
10 |
11 |
12 | PIPELINE_MAPPING = {
13 | "evaluator": Evaluator,
14 | "finetuner": Finetuner,
15 | "inferencer": Inferencer,
16 | "raft_aligner": RaftAligner,
17 | }
18 |
19 |
20 | class AutoPipeline:
21 | """
22 | The class designed to return a pipeline automatically based on its name.
23 | """
24 | @classmethod
25 | def get_pipeline(cls,
26 | pipeline_name,
27 | model_args,
28 | data_args,
29 | pipeline_args,
30 | *args,
31 | **kwargs
32 | ):
33 | if pipeline_name not in PIPELINE_MAPPING:
34 | raise NotImplementedError(
35 | f'Pipeline "{pipeline_name}" is not supported'
36 | )
37 |
38 | pipeline = PIPELINE_MAPPING[pipeline_name](
39 | model_args,
40 | data_args,
41 | pipeline_args,
42 | *args,
43 | **kwargs
44 | )
45 | return pipeline
46 |
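47 | # Example (illustrative): AutoPipeline.get_pipeline("finetuner", model_args,
48 | # data_args, pipeline_args) returns a Finetuner built from the parsed
49 | # argument dataclasses; unknown pipeline names raise NotImplementedError.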
--------------------------------------------------------------------------------
/scripts/run_finetune_with_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under ${project_id} in the project directory.
3 |
4 | deepspeed_args="--master_port=11000" # Default argument
5 | if [ $# -ge 1 ]; then
6 | deepspeed_args="$1"
7 | fi
8 |
9 | exp_id=finetune_with_lora
10 | project_dir=$(cd "$(dirname $0)"/..; pwd)
11 | output_dir=${project_dir}/output_models/${exp_id}
12 | log_dir=${project_dir}/log/${exp_id}
13 |
14 | dataset_path=${project_dir}/data/alpaca/train
15 |
16 | mkdir -p ${output_dir} ${log_dir}
17 |
18 | deepspeed ${deepspeed_args} \
19 | examples/finetune.py \
20 | --model_name_or_path facebook/galactica-1.3b \
21 | --dataset_path ${dataset_path} \
22 | --output_dir ${output_dir} --overwrite_output_dir \
23 | --num_train_epochs 0.01 \
24 | --learning_rate 1e-4 \
25 | --block_size 512 \
26 | --per_device_train_batch_size 1 \
27 | --use_lora 1 \
28 | --lora_r 8 \
29 | --save_aggregated_lora 0 \
30 | --deepspeed configs/ds_config_zero2.json \
31 | --bf16 \
32 | --run_name finetune_with_lora \
33 | --validation_split_percentage 0 \
34 | --logging_steps 20 \
35 | --do_train \
36 | --ddp_timeout 72000 \
37 | --save_steps 5000 \
38 | --dataloader_num_workers 1 \
39 | | tee ${log_dir}/train.log \
40 | 2> ${log_dir}/train.err
41 |
--------------------------------------------------------------------------------
/utils/train_tokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import argparse
5 | import os
6 | import sentencepiece as spm
7 |
8 | if __name__ == '__main__':
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--dataset_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False)
12 | parser.add_argument('--output_dir', default='./output_models/new_tokenizer', type=str, required=False)
13 | parser.add_argument('--vocab_size', default=20000, type=int, required=False)
14 | parser.add_argument('--model_type', default='bpe', type=str, required=False)
15 | parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False)
16 | args = parser.parse_args()
17 |
18 | dataset_path = args.dataset_path
19 | output_dir = args.output_dir
20 | vocab_size = args.vocab_size
21 | model_type = args.model_type
22 | user_defined_symbols = args.user_defined_symbols
23 |
24 | def mkdir(path):
25 | if not os.path.exists(path):
26 | os.makedirs(path)
27 | mkdir(output_dir)
28 |
29 | spm.SentencePieceTrainer.train(
30 |     '--input={} --model_prefix={} --model_type={} --vocab_size={} '
31 |     '--user_defined_symbols={} --minloglevel=1'.format(
32 |         dataset_path, output_dir + '/example', model_type, vocab_size,
33 |         user_defined_symbols))
--------------------------------------------------------------------------------
/utils/convert_minigpt4_checkpoints.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path as osp
3 | import torch
4 |
5 | def parse_args():
6 | parser = argparse.ArgumentParser(description="Convert checkpoint from MiniGPT4")
7 | parser.add_argument("--model_path", type=str, help="the model path for the to convert checkpoint")
8 | parser.add_argument("--save_path", default=None, type=str, help="the save path for converted checkpoint")
9 | args = parser.parse_args()
10 | return args
11 |
12 |
13 |
14 |
15 |
16 | if __name__ == "__main__":
17 | args = parse_args()
18 | model = torch.load(args.model_path)
19 | model = model['model']
20 | new_model = {}
21 | for key, item in model.items():  # rename MiniGPT-4 keys to Hugging Face BLIP-2 naming
22 | key = key.replace("Qformer", "qformer")  # Q-Former weights
23 | key = key.replace("llama_proj", "language_projection")  # vision-to-language projection
24 | key = key.replace("llama_model.model", "language_model.model")  # language model backbone
25 | new_model[key] = item
26 | if args.save_path is None:
27 | end_string = osp.splitext(args.model_path)
28 | save_path = osp.dirname(args.model_path) + "/" + \
29 | osp.basename(args.model_path).replace(".pth", "") + \
30 | "-converted" + osp.splitext(args.model_path)[-1]
31 | else:
32 | save_path = args.save_path
33 | print("save_path: {}".format(save_path))
34 |
35 | torch.save(new_model, save_path)
36 |
--------------------------------------------------------------------------------
/scripts/run_multistage_finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under the project directory.
3 |
4 | deepspeed_args="--master_port=11000" # Default argument
5 | if [ $# -ge 1 ]; then
6 | deepspeed_args="$1"
7 | fi
8 |
9 | exp_id=multistage_finetune
10 | project_dir=$(cd "$(dirname $0)"/..; pwd)
11 | output_dir=${project_dir}/output_models/${exp_id}
12 | log_dir=${project_dir}/log/${exp_id}
13 | dataset_path="${project_dir}/data/example_dataset/train"
14 |
15 | mkdir -p ${output_dir} ${log_dir}
16 |
17 | deepspeed ${deepspeed_args} \
18 | examples/multistage_finetune.py \
19 | --num_stages_per_epoch 1 \
20 | --run_name ${exp_id} \
21 | --model_name_or_path facebook/galactica-1.3b \
22 | --dataset_path ${dataset_path} \
23 | --output_dir ${output_dir} --overwrite_output_dir \
24 | --num_train_epochs 3 \
25 | --learning_rate 1e-3 \
26 | --block_size 512 \
27 | --per_device_train_batch_size 2 \
28 | --use_lora 1 \
29 | --lora_r 8 \
30 | --save_aggregated_lora 1 \
31 | --deepspeed configs/ds_config_zero2.json \
32 | --bf16 \
33 | --run_name ${exp_id} \
34 | --validation_split_percentage 0 \
35 | --logging_steps 20 \
36 | --do_train \
37 | --ddp_timeout 72000 \
38 | --save_steps 5000 \
39 | --dataloader_num_workers 1 \
40 | | tee ${log_dir}/train.log \
41 | 2> ${log_dir}/train.err
42 |
--------------------------------------------------------------------------------
/examples/merge_lora.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Merge base model and lora model into a full model.
6 | """
7 |
8 | import sys
9 | import os
10 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
11 |
12 | from dataclasses import dataclass, field
13 | from transformers import HfArgumentParser
14 | from typing import Optional
15 |
16 | from lmflow.args import (
17 | ModelArguments,
18 | AutoArguments,
19 | )
20 |
21 | from lmflow.models.auto_model import AutoModel
22 |
23 |
24 | @dataclass
25 | class MergeLoraArguments:
26 | output_model_path: Optional[str] = field(
27 | default=None,
28 | metadata={
29 | "help": "output merged full model path"
30 | },
31 | )
32 |
33 |
34 | def main():
35 | parser = HfArgumentParser((ModelArguments, MergeLoraArguments))
36 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
37 | model_args, merge_lora_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
38 | else:
39 | model_args, merge_lora_args = parser.parse_args_into_dataclasses()
40 |
41 | model_args.use_lora = True
42 | model = AutoModel.get_model(model_args)
43 | model.merge_lora_weights()
44 | model.save(merge_lora_args.output_model_path, save_full_model=True)
45 |
46 |
47 | if __name__ == '__main__':
48 | main()
49 |
--------------------------------------------------------------------------------
/scripts/run_reward_modeling.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under ${project_id} in project directory of
3 | # https://github.com/shizhediao/llm-ft
4 | # COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4
5 |
6 | deepspeed_args="--master_port=11000" # Default argument
7 | if [ $# -ge 1 ]; then
8 | deepspeed_args="$1"
9 | fi
10 |
11 | exp_id=rm
12 | project_dir=$(cd "$(dirname $0)"/..; pwd)
13 | output_dir=${project_dir}/output_models/${exp_id}
14 | log_dir=${project_dir}/log/${exp_id}
15 |
16 | dataset_path=${project_dir}/data/hh_rlhf/rm/hh_rlhf_rm_training.json
17 |
18 | mkdir -p ${output_dir} ${log_dir}
19 |
20 | deepspeed ${deepspeed_args} \
21 | examples/reward_modeling.py \
22 | --model_name_or_path gpt2 \
23 | --dataset_path ${dataset_path} \
24 | --output_dir ${output_dir} --overwrite_output_dir \
25 | --num_train_epochs 1 \
26 | --learning_rate 3e-5 \
27 | --block_size 512 \
28 | --per_device_train_batch_size 1 \
29 | --per_device_eval_batch_size 1 \
30 | --deepspeed configs/ds_config_zero2.json \
31 | --bf16 \
32 | --run_name rm_test \
33 | --validation_split_percentage 10 \
34 | --logging_steps 10 \
35 | --do_train \
36 | --ddp_timeout 72000 \
37 | --save_steps 999999 \
38 | --evaluation_strategy steps \
39 | --eval_steps 100 \
40 | --weight_decay 0.001 \
41 | --dataloader_num_workers 1 \
42 | | tee ${log_dir}/train.log \
43 | 2> ${log_dir}/train.err
44 |
--------------------------------------------------------------------------------
/utils/convert_json_to_txt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import argparse
5 | import logging
6 |
7 | import json
8 | from pathlib import Path
9 |
10 | logging.basicConfig(level=logging.WARNING)
11 |
12 | if __name__ == '__main__':
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--dataset_path', default='./data/wiki_zh_eval', type=str, required=False)
16 | parser.add_argument('--output_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False)
17 | parser.add_argument('--overwrite', action='store_true', help='overwrite the output file if it already exists')
18 | args = parser.parse_args()
19 |
20 | dataset_path = args.dataset_path
21 | outputfile = args.output_path
22 |
23 | outputs_list = []
24 | data_files = [
25 | x.absolute().as_posix()
26 | for x in Path(dataset_path).glob("*.json")
27 | ]
28 |
29 | for file_name in data_files:
30 | with open(file_name) as fin:
31 | json_data = json.load(fin)
32 | # The dataset "type" field is not needed here; only the instance texts are exported.
33 | for line in json_data["instances"]:
34 | outputs_list.append(line["text"])
35 |
36 |
37 | if Path(outputfile).exists() and not args.overwrite:
38 | logging.warning("File %s exists, will not overwrite.", outputfile)
39 | else:
40 | with open(outputfile, "w") as f:
41 | for line in outputs_list:
42 | f.write(line + "\n")
43 |
44 |
--------------------------------------------------------------------------------
/docs/source/examples/checkpoints.md:
--------------------------------------------------------------------------------
1 | # Checkpoints
2 |
3 | In general, you can load a model directly from a checkpoint with `--model_name_or_path`. However, the LLaMA case is slightly different due to its license restrictions.
4 |
5 |
6 | ## LLaMA Checkpoint
7 |
8 | 1. First, you need to request access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama). Download the official checkpoints and save them into `${llama-path}`.
9 |
10 | 2. Second, convert the official checkpoints `${llama-path}` to HuggingFace-format checkpoints `${llama-hf-path}` by running
11 |
12 | `python ./scripts/convert_llama_weights_to_hf.py --input_dir ${llama-path} --model_size 7B --output_dir ${llama-hf-path}/llama-7b-hf`
13 |
14 | 3. Then you are good to go by setting the checkpoint path to `${llama-hf-path}/llama-7b-hf`. Enjoy it!
15 |
16 | 4. (optional) Now you have the original llama-7b-hf pretrained model. With
17 | ```sh
18 | cd output_models && ./download.sh all && cd -
19 | ```
20 | you can obtain the model-difference weights finetuned by us. Then, with a command similar to `./scripts/run_evaluation_with_lora.sh`,
21 | ```sh
22 | CUDA_VISIBLE_DEVICES=0 \
23 | deepspeed examples/evaluate.py \
24 | --answer_type text \
25 | --model_name_or_path ${llama-hf-path}/llama-7b-hf \
26 | --lora_model_path output_models/${llama-model-diff-path} \
27 | --dataset_path data/alpaca/test \
28 | --prompt_structure "Input: {input}" \
29 | --deepspeed examples/ds_config.json
30 | ```
31 | You can now evaluate with the finetuned llama model.
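32 |
33 | 5. (optional) If you prefer a single set of full model weights instead of keeping the LoRA weights separate, you may merge them with `examples/merge_lora.py`. The command below is only a sketch: it assumes the usual `--lora_model_path` model argument, and the output path is just an example you can change freely.
34 | ```sh
35 | python examples/merge_lora.py \
36 | --model_name_or_path ${llama-hf-path}/llama-7b-hf \
37 | --lora_model_path output_models/${llama-model-diff-path} \
38 | --output_model_path output_models/llama-7b-hf-merged
39 | ```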
--------------------------------------------------------------------------------
/scripts/run_finetune_with_lora_save_aggregated_weights.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under the project directory.
3 |
4 | deepspeed_args="--master_port=11000" # Default argument
5 | if [ $# -ge 1 ]; then
6 | deepspeed_args="$1"
7 | fi
8 |
9 | exp_id=finetune_with_lora
10 | project_dir=$(cd "$(dirname $0)"/..; pwd)
11 | output_dir=${project_dir}/output_models/${exp_id}
12 | log_dir=${project_dir}/log/${exp_id}
13 |
14 | dataset_path=${project_dir}/data/alpaca/train
15 | eval_dataset_path=${project_dir}/data/alpaca/test
16 |
17 | mkdir -p ${output_dir} ${log_dir}
18 |
19 | deepspeed ${deepspeed_args} \
20 | examples/finetune.py \
21 | --model_name_or_path facebook/galactica-1.3b \
22 | --dataset_path ${dataset_path} \
23 | --output_dir ${output_dir} --overwrite_output_dir \
24 | --num_train_epochs 0.01 \
25 | --learning_rate 1e-4 \
26 | --block_size 512 \
27 | --per_device_train_batch_size 1 \
28 | --use_lora 1 \
29 | --lora_r 8 \
30 | --save_aggregated_lora 1 \
31 | --deepspeed configs/ds_config_zero2.json \
32 | --bf16 \
33 | --run_name finetune_with_lora \
34 | --validation_split_percentage 0 \
35 | --logging_steps 20 \
36 | --do_train \
37 | --do_eval \
38 | --evaluation_strategy "steps" \
39 | --eval_steps 1000 \
40 | --eval_dataset_path ${eval_dataset_path} \
41 | --ddp_timeout 72000 \
42 | --save_steps 5000 \
43 | --dataloader_num_workers 1 \
44 | | tee ${log_dir}/train.log \
45 | 2> ${log_dir}/train.err
46 |
--------------------------------------------------------------------------------
/configs/ds_config_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "bf16": {
12 | "enabled": "auto"
13 | },
14 |
15 | "optimizer": {
16 | "type": "AdamW",
17 | "params": {
18 | "lr": "auto",
19 | "betas": "auto",
20 | "eps": "auto",
21 | "weight_decay": "auto"
22 | }
23 | },
24 |
25 | "zero_optimization": {
26 | "stage": 3,
27 | "offload_optimizer": {
28 | "device": "cpu",
29 | "pin_memory": true
30 | },
31 | "offload_param": {
32 | "device": "cpu",
33 | "pin_memory": true
34 | },
35 | "overlap_comm": true,
36 | "contiguous_gradients": true,
37 | "sub_group_size": 1e9,
38 | "reduce_bucket_size": "auto",
39 | "stage3_prefetch_bucket_size": "auto",
40 | "stage3_param_persistence_threshold": "auto",
41 | "stage3_max_live_parameters": 1e9,
42 | "stage3_max_reuse_distance": 1e9,
43 | "stage3_gather_16bit_weights_on_model_save": true
44 | },
45 |
46 | "gradient_accumulation_steps": "auto",
47 | "gradient_clipping": "auto",
48 | "steps_per_print": 2000,
49 | "train_batch_size": "auto",
50 | "train_micro_batch_size_per_gpu": "auto",
51 | "wall_clock_breakdown": false
52 | }
53 |
--------------------------------------------------------------------------------
/scripts/run_raft_align.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Please run this script under the project directory.
3 |
4 | deepspeed_args="--master_port=11110" # Default argument
5 | if [ $# -ge 1 ]; then
6 | deepspeed_args="$1"
7 | fi
8 |
9 | exp_id=raft_align
10 | project_dir=$(cd "$(dirname $0)"/..; pwd)
11 | output_dir=${project_dir}/output_models/${exp_id}
12 | log_dir=${project_dir}/log/${exp_id}
13 |
14 | mkdir -p ${output_dir} ${log_dir}
15 |
16 | export PYTHONPATH=.
17 | deepspeed ${deepspeed_args} \
18 | examples/raft_align.py \
19 | --model_name_or_path gpt2 \
20 | --num_raft_iteration 20 \
21 | --learning_rate 2e-5 \
22 | --lr_scheduler_type "constant" \
23 | --bf16 \
24 | --deepspeed configs/ds_config_zero2.json \
25 | --dataset_path ${project_dir}/data/hh_rlhf/rlhf_prompt \
26 | --output_reward_path ${project_dir}/tmp/raft_aligner/reward.txt \
27 | --output_dir ${output_dir} --overwrite_output_dir \
28 | --run_name ${exp_id} \
29 | --num_train_epochs 4 \
30 | --per_device_train_batch_size 1 \
31 | --per_device_eval_batch_size 1 \
32 | --validation_split_percentage 0 \
33 | --logging_steps 1 \
34 | --do_train \
35 | --ddp_timeout 72000 \
36 | --save_steps 7777 \
37 | --dataloader_num_workers 1 \
38 | --preprocessing_num_workers 12 \
39 | --inference_batch_size_per_device 1 \
40 | --collection_strategy "local" \
41 | --raft_batch_size 1024 \
42 | --output_min_length 96 \
43 | --top_reward_percentage 0.125 \
44 | | tee ${log_dir}/raft_align.log \
45 | 2> ${log_dir}/raft_align.err
46 |
--------------------------------------------------------------------------------
/src/lmflow/models/text_regression_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """
4 | A model maps "text_only" data to float.
5 | """
6 |
7 | from lmflow.models.regression_model import RegressionModel
8 | from lmflow.datasets.dataset import Dataset
9 |
10 |
11 | class TextRegressionModel(RegressionModel):
12 | r"""
13 | Initializes a TextRegressionModel instance.
14 |
15 | Parameters
16 | ------------
17 |
18 | model_args :
19 | Model arguments such as model name, path, revision, etc.
20 |
21 | args : Optional.
22 | Positional arguments.
23 |
24 | kwargs : Optional.
25 | Keyword arguments.
26 | """
27 |
28 | def __init__(
29 | self,
30 | model_args,
31 | *args,
32 | **kwargs
33 | ):
34 | """
35 | Initializes a TextRegressionModel instance.
36 | :param model_args: dictionary with model arguments such as model name, path, revision, etc.
37 | """
38 | self.inference_func = None
39 |
40 |
41 | def register_inference_function(self, inference_func):
42 | """
43 | Registers a regression function.
44 | """
45 | self.inference_func = inference_func
46 |
47 |
48 | def inference(self, inputs: Dataset):
49 | """
50 | Gets regression results of a given dataset.
51 |
52 | :inputs: Dataset object, only accepts type "text_only".
53 | """
54 | if self.inference_func is not None:
55 | return self.inference_func(inputs)
56 | else:
57 | pass
58 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/run_data_preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run this shell script under project directory
3 |
4 | # For sample.py
5 | python scripts/data_preprocess/sample.py \
6 | --dataset_path ./data/example_dataset/train/train_50.json \
7 | --output_path ./data/example_dataset/train/train_50_sample.json \
8 | --ratio 0.5
9 |
10 | # For shuffle.py
11 | python scripts/data_preprocess/shuffle.py \
12 | --dataset_path ./data/example_dataset/train/train_50_sample.json \
13 | --output_path ./data/example_dataset/train/train_50_sample_shuffle.json
14 |
15 | # For merge.py : you can specify multiple files to merge
16 | python scripts/data_preprocess/merge.py \
17 | --dataset_path ./data/example_dataset/train/train_50.json \
18 | --merge_from_path ./data/example_dataset/train/train_50_sample_shuffle.json \
19 | ./data/example_dataset/train/train_50_sample.json \
20 | --output_path ./data/example_dataset/train/train_merge.json \
21 |
22 | # For concat.py: if you simply want to concatenate multiple files or a directory, use the following.
23 | # You can also specify multiple files after --merge_from_path
24 | python scripts/data_preprocess/concat.py \
25 | --merge_from_path ./data/example_dataset/train/*.json \
26 | --output_path ./data/example_dataset/train/train_merge.json \
27 |
28 | # For concat_shuffle_split.py: to concatenate multiple files, shuffle them, and split the result, use the following.
29 | python scripts/data_preprocess/concat_shuffle_split.py \
30 | --merge_from_path ./data/example_dataset/train/*.json \
31 | --output_path ./data/processed_dataset/ \
--------------------------------------------------------------------------------
/scripts/run_all_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | help_message="./$(basename $0)"
4 | help_message+=" --model_name_or_path MODEL_NAME_OR_PATH"
5 |
6 | if [ $# -ge 1 ]; then
7 | extra_args="$@"
8 | fi
9 |
10 | model_name_or_path=""
11 | while [[ $# -ge 1 ]]; do
12 | key="$1"
13 | case ${key} in
14 | -h|--help)
15 | printf "${help_message}" 1>&2
16 | exit 0
17 | ;;
18 | --model_name_or_path)
19 | model_name_or_path="$2"
20 | shift
21 | ;;
22 | *)
23 | # Ignores unknown options
24 | esac
25 | shift
26 | done
27 |
28 | model_name=$(echo "${model_name_or_path}" | sed "s/\//--/g")
29 | echo ${model_name}
30 |
31 | if [[ "${model_name}" = "" ]]; then
32 | echo "no model name specified" 1>&2
33 | exit 1
34 | fi
35 |
36 | log_dir=output_dir/${model_name}_lmflow_chat_nll_eval
37 | mkdir -p ${log_dir}
38 | echo "[Evaluating] Evaluate on LMFlow_chat"
39 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
40 |
41 | log_dir=output_dir/${model_name}_all_nll_eval
42 | mkdir -p ${log_dir}
43 | echo "[Evaluating] Evaluate on [commonsense, wiki, instruction_following (gpt4) ] nll evaluation"
44 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
45 |
46 | log_dir=output_dir/${model_name}_commonsense_qa_eval
47 | mkdir -p ${log_dir}
48 | echo "[Evaluating] Evaluate on commonsense QA Accuracy evaluation"
49 | ./scripts/run_benchmark.sh ${extra_args} --dataset_name commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
--------------------------------------------------------------------------------
/scripts/data_preprocess/count.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Counts the number of instances in a dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import random
12 | import sys
13 | import textwrap
14 |
15 | def parse_argument(sys_argv):
16 | """Parses arguments from command line.
17 | Args:
18 | sys_argv: the list of arguments (strings) from command line.
19 | Returns:
20 | A struct whose member corresponds to the required (optional) variable.
21 | For example,
22 | ```
23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
24 | args.input # 'a.txt'
25 | args.num # 10
26 | ```
27 | """
28 | parser = argparse.ArgumentParser(
29 | formatter_class=argparse.RawTextHelpFormatter)
30 |
31 | # Training parameters
32 | parser.add_argument(
33 | "--dataset_path", type=str,
34 | default=None,
35 | help="input dataset path, reads from stdin by default"
36 | )
37 |
38 | # Parses from commandline
39 | args = parser.parse_args(sys_argv[1:])
40 |
41 | return args
42 |
43 |
44 | def main():
45 | args = parse_argument(sys.argv)
46 | if args.dataset_path is not None:
47 | with open(args.dataset_path, "r") as fin:
48 | data_dict = json.load(fin)
49 | else:
50 | data_dict = json.load(sys.stdin)
51 |
52 | num_instances = len(data_dict["instances"])
53 | print(num_instances)
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import find_packages
3 | from setuptools import setup
4 | import subprocess
5 |
6 | folder = os.path.dirname(__file__)
7 | version_path = os.path.join(folder, "src", "lmflow", "version.py")
8 |
9 | __version__ = None
10 | with open(version_path) as f:
11 | exec(f.read(), globals())
12 |
13 | req_path = os.path.join(folder, "requirements.txt")
14 | install_requires = []
15 | if os.path.exists(req_path):
16 | with open(req_path) as fp:
17 | install_requires = [line.strip() for line in fp]
18 |
19 | readme_path = os.path.join(folder, "README.md")
20 | readme_contents = ""
21 | if os.path.exists(readme_path):
22 | with open(readme_path, encoding='utf-8') as fp:
23 | readme_contents = fp.read().strip()
24 |
25 | setup(
26 | name="lmflow",
27 | version=__version__,
28 | description="LMFlow: Large Model Flow.",
29 | author="The LMFlow Team",
30 | long_description=readme_contents,
31 | long_description_content_type="text/markdown",
32 | package_dir={"": "src"},
33 | packages=find_packages("src"),
34 | package_data={},
35 | install_requires=install_requires,
36 | classifiers=[
37 | "Intended Audience :: Science/Research/Engineering",
38 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
39 | "Programming Language :: Python :: 3.9",
40 | "Programming Language :: Python :: 3.10",
41 | ],
42 | python_requires=">=3.9",
43 | )
44 |
45 | # Must be called after all dependency installed, since flash-attn setup.py
46 | # relies on torch, packaging, etc.
47 | try:
48 | gpu_state = subprocess.check_output(["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"])
49 | if b"A100" in gpu_state:
50 | subprocess.call(["pip", "install", "flash-attn==1.0.4"])
51 | except Exception:
52 | pass
53 |
--------------------------------------------------------------------------------
/tests/models/test_auto_model.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from lmflow.args import ModelArguments
4 | from lmflow.models.auto_model import AutoModel
5 | from lmflow.models.hf_decoder_model import HFDecoderModel
6 | from lmflow.models.text_regression_model import TextRegressionModel
7 | from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel
8 |
9 | MODEL_NAME = "gpt2"
10 |
11 |
12 | class AutoModelTest(unittest.TestCase):
13 |
14 | def test_get_decoder_model(self):
15 | model_args = ModelArguments(
16 | arch_type="decoder_only", model_name_or_path=MODEL_NAME)
17 | model = AutoModel.get_model(model_args)
18 | self.assertTrue(isinstance(model, HFDecoderModel))
19 |
20 |
21 | # This unit test is commented out since the text regression model has not been fully implemented
22 | '''
23 | def test_get_text_regression_model(self):
24 | model_args = ModelArguments(
25 | arch_type="text_regression", model_name_or_path=MODEL_NAME)
26 | model = AutoModel.get_model(model_args)
27 | self.assertTrue(isinstance(model, TextRegressionModel))
28 | '''
29 |
30 |
31 | # This unit test is commented out since the encoder decoder model has not been fully implemented
32 | '''
33 | def test_get_encoder_decoder(self):
34 | model_args = ModelArguments(
35 | arch_type="encoder_decoder", model_name_or_path=MODEL_NAME)
36 | model = AutoModel.get_model(model_args)
37 | self.assertTrue(isinstance(model, HFEncoderDecoderModel))
38 | '''
39 |
40 |
41 | def test_get_unsupported_model(self):
42 | model_args = ModelArguments(
43 | arch_type="unsupported model", model_name_or_path=MODEL_NAME)
44 | with self.assertRaises(NotImplementedError):
45 | model = AutoModel.get_model(model_args)
46 |
--------------------------------------------------------------------------------
/examples/evaluation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """A one-line summary of the module or program, terminated by a period.
5 |
6 | Leave one blank line. The rest of this docstring should contain an
7 | overall description of the module or program. Optionally, it may also
8 | contain a brief description of exported classes and functions and/or usage
9 | examples.
10 |
11 | Typical usage example:
12 |
13 | foo = ClassFoo()
14 | bar = foo.FunctionBar()
15 | """
16 | import json
17 | import os
18 | import sys
19 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
20 | from transformers import HfArgumentParser
21 |
22 | from lmflow.datasets.dataset import Dataset
23 | from lmflow.pipeline.auto_pipeline import AutoPipeline
24 | from lmflow.models.auto_model import AutoModel
25 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
26 |
27 |
28 | pipeline_name = "evaluator"
29 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
30 |
31 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
32 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
33 |
34 | with open(pipeline_args.deepspeed, "r") as f:
35 | ds_config = json.load(f)
36 |
37 | model = AutoModel.get_model(
38 | model_args,
39 | tune_strategy='none',
40 | ds_config=ds_config,
41 | use_accelerator=pipeline_args.use_accelerator_for_evaluator
42 | )
43 | dataset = Dataset(data_args)
44 |
45 | evaluator = AutoPipeline.get_pipeline(
46 | pipeline_name=pipeline_name,
47 | model_args=model_args,
48 | data_args=data_args,
49 | pipeline_args=pipeline_args,
50 | )
51 | evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric)
52 |
--------------------------------------------------------------------------------
/docs/source/examples/medical_finetune.md:
--------------------------------------------------------------------------------
1 | # Finetune
2 |
3 | ```python
4 | import os
5 | import sys
6 | from transformers import HfArgumentParser
7 |
8 | from lmflow.args import (
9 | ModelArguments,
10 | DatasetArguments,
11 | AutoArguments,
12 | )
13 |
14 | from lmflow.datasets.dataset import Dataset
15 | from lmflow.models.tunable_models import TunableModel
16 | from lmflow.pipeline.auto_pipeline import AutoPipeline
17 |
18 |
19 | def main():
20 | # Parses arguments
21 | pipeline_name = "finetuner"
22 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
23 |
24 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
25 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
26 | # If we pass only one argument to the script and it's the path to a json file,
27 | # let's parse it to get our arguments.
28 | model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
29 | else:
30 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
31 |
32 | # TODO: deepspeed config initialization
33 |
34 | # Initialization
35 | finetuner = AutoPipeline.get_pipeline(
36 | pipeline_name=pipeline_name,
37 | model_args=model_args,
38 | data_args=data_args,
39 | pipeline_args=pipeline_args,
40 | )
41 | dataset = Dataset(data_args)
42 | model = TunableModel(model_args)
43 |
44 | # Tokenization and text grouping must be done in the main process
45 | with pipeline_args.main_process_first(desc="dataset map tokenization"):
46 | tokenized_dataset = model.tokenize(dataset)
47 | lm_dataset = finetuner.group_text(
48 | tokenized_dataset,
49 | model_max_length=model.get_max_length(),
50 | )
51 |
52 | # Finetuning
53 | tuned_model = finetuner.tune(model=model, lm_dataset=lm_dataset)
54 |
55 | ```
56 |
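57 | A finetuning script like the one above (cf. `examples/finetune.py` in the
58 | repository) is typically launched with `deepspeed`, in the same way as
59 | `./scripts/run_finetune.sh`. The flags below are only illustrative; adjust the
60 | model, dataset path and hyperparameters to your own setup.
61 |
62 | ```sh
63 | deepspeed --master_port=11000 examples/finetune.py \
64 | --model_name_or_path facebook/galactica-1.3b \
65 | --dataset_path data/example_dataset/train \
66 | --output_dir output_models/finetune --overwrite_output_dir \
67 | --num_train_epochs 1 \
68 | --learning_rate 2e-5 \
69 | --block_size 512 \
70 | --per_device_train_batch_size 1 \
71 | --deepspeed configs/ds_config_zero2.json \
72 | --bf16 \
73 | --do_train
74 | ```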
--------------------------------------------------------------------------------
/utils/make_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Make the delta weights by subtracting base weights.
3 |
4 | Usage:
5 | python3 utils/make_delta.py --base-model-path ~/model_weights/llama-13b --target-model-path ~/model_weights/vicuna-13b --delta-path ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
6 | """
7 | import argparse
8 |
9 | import torch
10 | from tqdm import tqdm
11 | from transformers import AutoTokenizer, AutoModelForCausalLM
12 |
13 |
14 | def make_delta(base_model_path, target_model_path, delta_path):
15 | print(f"Loading the base model from {base_model_path}")
16 | base = AutoModelForCausalLM.from_pretrained(
17 | base_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
18 | )
19 |
20 | print(f"Loading the target model from {target_model_path}")
21 | target = AutoModelForCausalLM.from_pretrained(
22 | target_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
23 | )
24 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)
25 |
26 | print("Calculating the delta")
27 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
28 | assert name in base.state_dict()
29 | param.data -= base.state_dict()[name]
30 |
31 | print(f"Saving the delta to {delta_path}")
32 | if args.hub_repo_id:
33 | kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id}
34 | else:
35 | kwargs = {}
36 | target.save_pretrained(delta_path, **kwargs)
37 | target_tokenizer.save_pretrained(delta_path, **kwargs)
38 |
39 |
40 | if __name__ == "__main__":
41 | parser = argparse.ArgumentParser()
42 | parser.add_argument("--base-model-path", type=str, required=True)
43 | parser.add_argument("--target-model-path", type=str, required=True)
44 | parser.add_argument("--delta-path", type=str, required=True)
45 | parser.add_argument("--hub-repo-id", type=str)
46 | args = parser.parse_args()
47 |
48 | make_delta(args.base_model_path, args.target_model_path, args.delta_path)
49 |
--------------------------------------------------------------------------------
/scripts/run_vis_chatbot_gradio_minigpt4.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model=Salesforce/blip2-flan-t5-xxl
4 |
5 | # if [ ! -f output_models/pretrained_minigpt4_7b.pth ]; then
6 | # cd output_models && ./download.sh minigpt4_7b && cd -
7 | # fi
8 | #
9 | # if [ ! -f output_models/pretrained_minigpt4_7b_converted.pth ]; then
10 | # python utils/convert_minigpt4_checkpoints.py \
11 | # --model_path output_models/pretrained_minigpt4_7b.pth \
12 | # --save_path output_models/pretrained_minigpt4_7b_converted.pth
13 | # fi
14 | #
15 | # deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \
16 | # --model_name_or_path ${model} \
17 | # --deepspeed configs/ds_config_multimodal.json \
18 | # --arch_type vision_encoder_decoder \
19 | # --task vqa \
20 | # --custom_model \
21 | # --prompt_format mini_gpt \
22 | # --prompt_structure "###Human: {input_text}###Assistant:" \
23 | # --llm_model_name_or_path LMFlow/Full-Robin-7b-v2 \
24 | # --checkpoint_path output_models/pretrained_minigpt4_7b_converted.pth \
25 | # --low_resource True \
26 | # --max_new_tokens 1024
27 |
28 | if [ ! -f output_models/pretrained_minigpt4_13b.pth ]; then
29 | cd output_models && ./download.sh minigpt4_13b && cd -
30 | fi
31 |
32 | if [ ! -f output_models/pretrained_minigpt4_13b_converted.pth ]; then
33 | python utils/convert_minigpt4_checkpoints.py \
34 | --model_path output_models/pretrained_minigpt4_13b.pth \
35 | --save_path output_models/pretrained_minigpt4_13b_converted.pth
36 | fi
37 |
38 | deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \
39 | --model_name_or_path ${model} \
40 | --deepspeed configs/ds_config_multimodal.json \
41 | --arch_type vision_encoder_decoder \
42 | --task vqa \
43 | --custom_model \
44 | --prompt_format mini_gpt \
45 | --prompt_structure "###Human: {input_text}###Assistant:" \
46 | --llm_model_name_or_path LMFlow/Full-Robin-13b-v2 \
47 | --checkpoint_path output_models/pretrained_minigpt4_13b_converted.pth \
48 | --low_resource True \
49 | --max_new_tokens 1024
50 |
--------------------------------------------------------------------------------
/examples/finetune.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """A one-line summary of the module or program, terminated by a period.
5 |
6 | Leave one blank line. The rest of this docstring should contain an
7 | overall description of the module or program. Optionally, it may also
8 | contain a brief description of exported classes and functions and/or usage
9 | examples.
10 |
11 | Typical usage example:
12 |
13 | foo = ClassFoo()
14 | bar = foo.FunctionBar()
15 | """
16 |
17 | import sys
18 | import os
19 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
20 | from transformers import HfArgumentParser
21 |
22 | from lmflow.args import (
23 | ModelArguments,
24 | DatasetArguments,
25 | AutoArguments,
26 | )
27 |
28 | from lmflow.datasets.dataset import Dataset
29 | from lmflow.models.auto_model import AutoModel
30 | from lmflow.pipeline.auto_pipeline import AutoPipeline
31 |
32 |
33 | def main():
34 | # Parses arguments
35 | pipeline_name = "finetuner"
36 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
37 |
38 | parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
39 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
40 | # If we pass only one argument to the script and it's the path to a json file,
41 | # let's parse it to get our arguments.
42 | model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
43 | else:
44 | model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
45 |
46 | # Initialization
47 | finetuner = AutoPipeline.get_pipeline(
48 | pipeline_name=pipeline_name,
49 | model_args=model_args,
50 | data_args=data_args,
51 | pipeline_args=pipeline_args,
52 | )
53 | dataset = Dataset(data_args)
54 | model = AutoModel.get_model(model_args)
55 |
56 | # Finetuning
57 | tuned_model = finetuner.tune(model=model, dataset=dataset)
58 |
59 |
60 | if __name__ == '__main__':
61 | main()
62 |
--------------------------------------------------------------------------------
/tests/datasets/test_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | """A one-line summary of the module or program, terminated by a period.
4 |
5 | Leave one blank line. The rest of this docstring should contain an
6 | overall description of the module or program. Optionally, it may also
7 | contain a brief description of exported classes and functions and/or usage
8 | examples.
9 |
10 | Typical usage example:
11 |
12 | foo = ClassFoo()
13 | bar = foo.FunctionBar()
14 | """
15 | from __future__ import absolute_import
16 | import unittest
17 |
18 | import json
19 | import os
20 | from pathlib import Path
21 |
22 | from lmflow.args import DatasetArguments
23 | from lmflow.datasets.dataset import Dataset
24 |
25 |
26 | class DatasetTest(unittest.TestCase):
27 |
28 | def test_init(self):
29 | dataset_dir = 'data/example_dataset/train'
30 | data_args = DatasetArguments(
31 | dataset_path=dataset_dir
32 | )
33 | dataset = Dataset(data_args, backend='huggingface')
34 | hf_dataset = dataset.get_backend_dataset()
35 |
36 | with open(os.path.join(Path(dataset_dir), 'train_50.json'), 'r') as fin:
37 | json_obj = json.load(fin)
38 | for i in range(len(hf_dataset)):
39 | self.assertEqual(json_obj['instances'][i], hf_dataset[i])
40 |
41 |
42 | def test_create_from_dict(self):
43 | data_dict = {
44 | "type": "text2text",
45 | "instances": [
46 | { "input": "INPUT 1", "output": "OUTPUT 1" },
47 | { "input": "INPUT 2", "output": "OUTPUT 2" },
48 | ]
49 | }
50 | dataset = Dataset.create_from_dict(data_dict)
51 | self.assertEqual(dataset.to_dict(), data_dict)
52 |
53 |
54 | def test_create_from_dict_bad_type(self):
55 | data_dict = {
56 | "type": "non-supported",
57 | "instances": [
58 | { "input": "INPUT 1", "output": "OUTPUT 1" },
59 | { "input": "INPUT 2", "output": "OUTPUT 2" },
60 | ]
61 | }
62 | with self.assertRaises(ValueError):
63 | dataset = Dataset.create_from_dict(data_dict)
64 |
--------------------------------------------------------------------------------
/docs/source/_static/logo5.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experimental/RAFT-diffusion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.18.0
2 | asttokens==2.2.1
3 | backcall==0.2.0
4 | bitsandbytes==0.37.2
5 | certifi==2022.12.7
6 | charset-normalizer==3.1.0
7 | clip==1.0
8 | cmake==3.26.1
9 | comm==0.1.3
10 | contourpy==1.0.7
11 | cycler==0.11.0
12 | debugpy==1.6.7
13 | decorator==5.1.1
14 | diffusers==0.14.0
15 | executing==1.2.0
16 | filelock==3.11.0
17 | fonttools==4.39.3
18 | ftfy==6.1.1
19 | huggingface-hub==0.13.4
20 | idna==3.4
21 | importlib-metadata==6.2.0
22 | importlib-resources==5.12.0
23 | ipykernel==6.22.0
24 | ipython==8.12.0
25 | jedi==0.18.2
26 | Jinja2==3.1.2
27 | jupyter_client==8.1.0
28 | jupyter_core==5.3.0
29 | kiwisolver==1.4.4
30 | lit==16.0.0
31 | MarkupSafe==2.1.2
32 | matplotlib==3.7.1
33 | matplotlib-inline==0.1.6
34 | mpmath==1.3.0
35 | mypy-extensions==1.0.0
36 | nest-asyncio==1.5.6
37 | networkx==3.1
38 | numpy==1.24.2
39 | nvidia-cublas-cu11==11.10.3.66
40 | nvidia-cuda-cupti-cu11==11.7.101
41 | nvidia-cuda-nvrtc-cu11==11.7.99
42 | nvidia-cuda-runtime-cu11==11.7.99
43 | nvidia-cudnn-cu11==8.5.0.96
44 | nvidia-cufft-cu11==10.9.0.58
45 | nvidia-curand-cu11==10.2.10.91
46 | nvidia-cusolver-cu11==11.4.0.1
47 | nvidia-cusparse-cu11==11.7.4.91
48 | nvidia-nccl-cu11==2.14.3
49 | nvidia-nvtx-cu11==11.7.91
50 | open-clip-torch==2.16.0
51 | packaging==23.0
52 | pandas==2.0.0
53 | parso==0.8.3
54 | pexpect==4.8.0
55 | pickleshare==0.7.5
56 | Pillow==9.5.0
57 | pip==23.0.1
58 | platformdirs==3.2.0
59 | prompt-toolkit==3.0.38
60 | protobuf==3.20.3
61 | psutil==5.9.4
62 | ptyprocess==0.7.0
63 | pure-eval==0.2.2
64 | Pygments==2.14.0
65 | pyparsing==3.0.9
66 | pyre-extensions==0.0.23
67 | python-dateutil==2.8.2
68 | pytz==2023.3
69 | PyYAML==6.0
70 | pyzmq==25.0.2
71 | regex==2023.3.23
72 | requests==2.28.2
73 | sentencepiece==0.1.97
74 | setuptools==65.6.3
75 | six==1.16.0
76 | stack-data==0.6.2
77 | sympy==1.11.1
78 | timm==0.6.13
79 | tokenizers==0.13.3
80 | torch==2.0.0
81 | torchvision==0.15.1
82 | tornado==6.2
83 | tqdm==4.65.0
84 | traitlets==5.9.0
85 | transformers==4.27.4
86 | triton==2.0.0
87 | typing_extensions==4.5.0
88 | typing-inspect==0.8.0
89 | tzdata==2023.3
90 | urllib3==1.26.15
91 | wcwidth==0.2.6
92 | wheel==0.38.4
93 | xformers==0.0.18
94 | zipp==3.15.0
95 |
96 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/shuffle.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Shuffles the instances of a dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import random
12 | import sys
13 | import textwrap
14 |
15 | def parse_argument(sys_argv):
16 | """Parses arguments from command line.
17 | Args:
18 | sys_argv: the list of arguments (strings) from command line.
19 | Returns:
20 | A struct whose member corresponds to the required (optional) variable.
21 | For example,
22 | ```
23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
24 | args.input # 'a.txt'
25 | args.num # 10
26 | ```
27 | """
28 | parser = argparse.ArgumentParser(
29 | formatter_class=argparse.RawTextHelpFormatter)
30 |
31 | # Training parameters
32 | parser.add_argument(
33 | "--dataset_path", type=str,
34 | default=None,
35 | help="input dataset path, reads from stdin by default"
36 | )
37 | parser.add_argument(
38 | "--output_path", type=str,
39 | default=None,
40 | help="output dataset path, writes to stdout by default"
41 | )
42 | parser.add_argument(
43 | "--seed", type=int, default=42,
44 | help="pseudorandom seed"
45 | )
46 |
47 | # Parses from commandline
48 | args = parser.parse_args(sys_argv[1:])
49 |
50 | return args
51 |
52 |
53 | def main():
54 | args = parse_argument(sys.argv)
55 | if args.dataset_path is not None:
56 | with open(args.dataset_path, "r") as fin:
57 | data_dict = json.load(fin)
58 | else:
59 | data_dict = json.load(sys.stdin)
60 |
61 | random.seed(args.seed)
62 | random.shuffle(data_dict["instances"])
63 |
64 | if args.output_path is not None:
65 | with open(args.output_path, "w") as fout:
66 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
67 | else:
68 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
69 |
70 |
71 | if __name__ == "__main__":
72 | main()
73 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 |
9 | project = 'LMFlow'
10 | copyright = 'LMFlow 2023'
11 | author = 'The LMFlow Team'
12 |
13 | import sys
14 | import os
15 | sys.path.insert(0,os.path.abspath('../..'))
16 |
17 |
18 | # -- General configuration ---------------------------------------------------
19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
20 |
21 | extensions = []
22 |
23 | templates_path = ['_templates']
24 | exclude_patterns = []
25 |
26 | extensions = [
27 | "sphinx.ext.autodoc",
28 | "sphinx.ext.autosummary",
29 | "sphinx.ext.todo",
30 | "sphinx.ext.viewcode",
31 | 'myst_parser',
32 | 'autoapi.extension',
33 | #"sphinxext.rediraffe",
34 | "sphinx_design",
35 | #"sphinx_copybutton",
36 | # For extension examples and demos
37 | #"ablog",
38 | "matplotlib.sphinxext.plot_directive",
39 | #"myst_nb",
40 | # "nbsphinx", # Uncomment and comment-out MyST-NB for local testing purposes.
41 | "numpydoc",
42 | #"sphinx_togglebutton",
43 | #"sphinx_favicon",
44 | ]
45 |
46 | autosummary_generate = True
47 |
48 | autoapi_type = 'python'
49 | autoapi_dirs = ['../../src']
50 |
51 | html_theme_options = {
52 | "header_links_before_dropdown": 4,
53 | "icon_links": [
54 | {
55 | "name": "LMFlow",
56 | "url": "https://github.com/OptimalScale/LMFlow",
57 | "icon": "_static/logo5.svg",
58 | "type": "local",
59 | "attributes": {"target": "_blank"},
60 | },
61 | ],
62 | "logo": {
63 | "text": "LMFlow",
64 | "image_dark": "_static/logo5.svg",
65 | "alt_text": "LMFlow",
66 | },
67 | }
68 |
69 |
70 | # -- Options for HTML output -------------------------------------------------
71 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
72 |
73 | html_theme = 'pydata_sphinx_theme'
74 | html_static_path = ['_static']
75 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/sample.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Samples a certain ratio of instances from a dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import random
12 | import sys
13 | import textwrap
14 |
15 | def parse_argument(sys_argv):
16 | """Parses arguments from command line.
17 | Args:
18 | sys_argv: the list of arguments (strings) from command line.
19 | Returns:
20 | A struct whose member corresponds to the required (optional) variable.
21 | For example,
22 | ```
23 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
24 | args.input # 'a.txt'
25 | args.num # 10
26 | ```
27 | """
28 | parser = argparse.ArgumentParser(
29 | formatter_class=argparse.RawTextHelpFormatter)
30 |
31 | # Training parameters
32 | parser.add_argument(
33 | "--dataset_path", type=str,
34 | default=None,
35 | help="input dataset path, reads from stdin by default"
36 | )
37 | parser.add_argument(
38 | "--output_path", type=str,
39 | default=None,
40 | help="output dataset path, writes to stdout by default"
41 | )
42 | parser.add_argument(
43 | "--ratio", type=float, required=True,
44 | help="sample ratio, will be floored if number of samples is not a int"
45 | )
46 | parser.add_argument(
47 | "--seed", type=int, default=42,
48 | help="pseudorandom seed"
49 | )
50 |
51 | # Parses from commandline
52 | args = parser.parse_args(sys_argv[1:])
53 |
54 | return args
55 |
56 |
57 | def main():
58 | args = parse_argument(sys.argv)
59 | if args.dataset_path is not None:
60 | with open(args.dataset_path, "r") as fin:
61 | data_dict = json.load(fin)
62 | else:
63 | data_dict = json.load(sys.stdin)
64 |
65 | random.seed(args.seed)
66 | num_instances = len(data_dict["instances"])
67 | num_sample = int(num_instances * args.ratio)
68 |
69 | data_dict["instances"] = random.sample(data_dict["instances"], num_sample)
70 |
71 | if args.output_path is not None:
72 | with open(args.output_path, "w") as fout:
73 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
74 | else:
75 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
76 |
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
/docs/source/_static/logo4.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/add_prompt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Adds prompt structure to a text2text dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import textwrap
12 | import sys
13 |
14 | def parse_argument(sys_argv):
15 | """Parses arguments from command line.
16 | Args:
17 | sys_argv: the list of arguments (strings) from command line.
18 | Returns:
19 | A struct whose member corresponds to the required (optional) variable.
20 | For example,
21 | ```
22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
23 | args.input # 'a.txt'
24 | args.num # 10
25 | ```
26 | """
27 | parser = argparse.ArgumentParser(
28 | formatter_class=argparse.RawTextHelpFormatter)
29 |
30 | # Training parameters
31 | parser.add_argument(
32 | "--dataset_path", type=str,
33 | default=None,
34 | help=textwrap.dedent("input dataset path, reads from stdin by default")
35 | )
36 | parser.add_argument(
37 | "--output_path", type=str,
38 | default=None,
39 | help=textwrap.dedent("output dataset path, writes to stdout by default")
40 | )
41 | parser.add_argument(
42 | "--prompt_structure", type=str,
43 | default="{input}",
44 | help=textwrap.dedent("prompt structure to augment input")
45 | )
46 |
47 | # Parses from commandline
48 | args = parser.parse_args(sys_argv[1:])
49 |
50 | return args
51 |
52 |
53 | def main():
54 | args = parse_argument(sys.argv)
55 | if args.dataset_path is not None:
56 | with open(args.dataset_path, "r") as fin:
57 | data_dict = json.load(fin)
58 | else:
59 | data_dict = json.load(sys.stdin)
60 |
61 | if data_dict["type"] != "text2text":
62 | raise NotImplementedError(
63 | "only support text2text prompt augmentation"
64 | )
65 |
66 | data_dict["instances"] = [
67 | {
68 | "input": args.prompt_structure.format(input=instance["input"]),
69 | "output": instance["output"],
70 | }
71 | for instance in data_dict["instances"]
72 | ]
73 | if args.output_path is not None:
74 | with open(args.output_path, "w") as fout:
75 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
76 | else:
77 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
78 |
79 |
80 | if __name__ == "__main__":
81 | main()
82 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/concat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Concatenates multiple datasets into a single dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import textwrap
12 | import sys
13 |
14 | def parse_argument(sys_argv):
15 | """Parses arguments from command line.
16 | Args:
17 | sys_argv: the list of arguments (strings) from command line.
18 | Returns:
19 | A struct whose member corresponds to the required (optional) variable.
20 | For example,
21 | ```
22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
23 | args.input # 'a.txt'
24 | args.num # 10
25 | ```
26 | """
27 | parser = argparse.ArgumentParser(
28 | formatter_class=argparse.RawTextHelpFormatter)
29 |
30 | # Training parameters
31 | parser.add_argument(
32 | "--output_path", type=str,
33 | default=None,
34 | help=textwrap.dedent("output dataset path, writes to stdout by default")
35 | )
36 | parser.add_argument(
37 | "--merge_from_path", type=str,
38 | nargs="+",
39 | help=textwrap.dedent(
40 | "dataset path of the extra dataset that will be merged"
41 | " into input dataset"
42 | )
43 | )
44 |
45 | # Parses from commandline
46 | args = parser.parse_args(sys_argv[1:])
47 |
48 | return args
49 |
50 |
51 | def main():
52 | args = parse_argument(sys.argv)
53 |
54 | if args.merge_from_path is not None:
55 | for i in range(0, len(args.merge_from_path)):
56 | with open(args.merge_from_path[i], "r") as fin:
57 | extra_data_dict = json.load(fin)
58 | if i == 0:
59 | data_dict = extra_data_dict
60 | else:
61 | if data_dict["type"] != extra_data_dict["type"]:
62 | raise ValueError(
63 | 'two dataset have different types:'
64 | f' input dataset: "{data_dict["type"]}";'
65 | f' merge from dataset: "{extra_data_dict["type"]}"'
66 | )
67 | data_dict["instances"].extend(extra_data_dict["instances"])
68 | else:
69 | raise ValueError("No merge files specified")
70 |
71 | if args.output_path is not None:
72 | with open(args.output_path, "w") as fout:
73 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
74 | else:
75 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
76 |
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
/tests/pipeline/test_auto_pipeline.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from lmflow.args import DatasetArguments
4 | from lmflow.args import EvaluatorArguments
5 | from lmflow.args import FinetunerArguments
6 | from lmflow.args import InferencerArguments
7 | from lmflow.args import ModelArguments
8 | from lmflow.args import RaftAlignerArguments
9 | from lmflow.pipeline.auto_pipeline import AutoPipeline
10 | from lmflow.pipeline.evaluator import Evaluator
11 | from lmflow.pipeline.finetuner import Finetuner
12 | from lmflow.pipeline.inferencer import Inferencer
13 | from lmflow.pipeline.raft_aligner import RaftAligner
14 |
15 | MODEL_NAME = "gpt2"
16 |
17 |
18 | class AutoPipelineTest(unittest.TestCase):
19 |
20 | def test_get_evaluator_pipeline(self):
21 | model_args = ModelArguments(model_name_or_path=MODEL_NAME)
22 | dataset_args = DatasetArguments()
23 | evaluator_args = EvaluatorArguments()
24 | pipeline = AutoPipeline.get_pipeline(
25 | "evaluator", model_args, dataset_args, evaluator_args)
26 |
27 | self.assertTrue(isinstance(pipeline, Evaluator))
28 |
29 | def test_get_finetuner_pipeline(self):
30 | model_args = ModelArguments(model_name_or_path=MODEL_NAME)
31 | dataset_args = DatasetArguments()
32 | finetuner_args = FinetunerArguments(output_dir="~/tmp")
33 | pipeline = AutoPipeline.get_pipeline(
34 | "finetuner", model_args, dataset_args, finetuner_args)
35 |
36 | self.assertTrue(isinstance(pipeline, Finetuner))
37 |
38 | def test_get_inferencer_pipeline(self):
39 | model_args = ModelArguments(model_name_or_path=MODEL_NAME)
40 | dataset_args = DatasetArguments()
41 | inferencer_args = InferencerArguments()
42 | pipeline = AutoPipeline.get_pipeline(
43 | "inferencer", model_args, dataset_args, inferencer_args)
44 |
45 | self.assertTrue(isinstance(pipeline, Inferencer))
46 |
47 | def test_get_raft_aligner_pipeline(self):
48 | model_args = ModelArguments(model_name_or_path=MODEL_NAME)
49 | dataset_args = DatasetArguments()
50 | raft_aligner_args = RaftAlignerArguments(output_dir="~/tmp")
51 | pipeline = AutoPipeline.get_pipeline(
52 | "raft_aligner", model_args, dataset_args, raft_aligner_args)
53 |
54 | self.assertTrue(isinstance(pipeline, RaftAligner))
55 |
56 | def test_get_unsupported_pipeline(self):
57 | model_args = ModelArguments(model_name_or_path=MODEL_NAME)
58 | dataset_args = DatasetArguments()
59 |
60 | with self.assertRaisesRegex(NotImplementedError, "Pipeline \"unsupported\" is not supported"):
61 | pipeline = AutoPipeline.get_pipeline(
62 | "unsupported", model_args, dataset_args, None)
63 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/add_end_mark.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Appends an end mark to each instance of a text_only or text2text dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import textwrap
12 | import sys
13 |
14 | def parse_argument(sys_argv):
15 | """Parses arguments from command line.
16 | Args:
17 | sys_argv: the list of arguments (strings) from command line.
18 | Returns:
19 | A struct whose member corresponds to the required (optional) variable.
20 | For example,
21 | ```
22 | args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
23 | args.input # 'a.txt'
24 | args.num # 10
25 | ```
26 | """
27 | parser = argparse.ArgumentParser(
28 | formatter_class=argparse.RawTextHelpFormatter)
29 |
30 | # Training parameters
31 | parser.add_argument(
32 | "--dataset_path", type=str,
33 | default=None,
34 | help=textwrap.dedent("input dataset path, reads from stdin by default")
35 | )
36 | parser.add_argument(
37 | "--output_path", type=str,
38 | default=None,
39 | help=textwrap.dedent("output dataset path, writes to stdout by default")
40 | )
41 | parser.add_argument(
42 | "--end_mark", type=str,
43 | default="###",
44 | help=textwrap.dedent("end mark appended to the end of each output")
45 | )
46 |
47 | # Parses from commandline
48 | args = parser.parse_args(sys_argv[1:])
49 |
50 | return args
51 |
52 |
53 | def main():
54 | args = parse_argument(sys.argv)
55 | if args.dataset_path is not None:
56 | with open(args.dataset_path, "r") as fin:
57 | data_dict = json.load(fin)
58 | else:
59 | data_dict = json.load(sys.stdin)
60 |
61 | output_field_map = {
62 | "text_only": "text",
63 | "text2text": "output",
64 | }
65 | data_dict_type = data_dict["type"]
66 | if not data_dict_type in output_field_map:
67 | raise NotImplementedError(
68 | "only support text_only or text2text dataset"
69 | )
70 |
71 | output_field = output_field_map[data_dict_type]
72 |
73 | num_instances = len(data_dict["instances"])
74 | for i in range(num_instances):
75 | data_dict["instances"][i][output_field] += args.end_mark
76 |
77 | if args.output_path is not None:
78 | with open(args.output_path, "w") as fout:
79 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
80 | else:
81 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
82 |
83 |
84 | if __name__ == "__main__":
85 | main()
86 |
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Docker
2 |
3 | LMFlow is available as a docker image on Docker Hub, built from the Dockerfile
4 | in this directory on top of cuda:11.3.0-cudnn8 (base image:
5 | nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04). You need at least an
6 | Nvidia 3090 GPU on your machine, with a CUDA driver compatible with CUDA
7 | 11.3.0, to run this docker image.
8 |
9 | ## Install docker with nvidia support
10 |
11 | First, you may need to install docker with nvidia support. This step requires
12 | root permission. If you do not have root access, you may need to ask the system
13 | administrator to do that for you.
14 |
15 | We provide an example in Ubuntu 20.04. For other operating systems, you may
16 | refer to Nvidia's [Install
17 | Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).
18 |
19 | ```sh
20 | curl https://get.docker.com | sh && sudo systemctl --now enable docker
21 |
22 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
23 | && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
24 | | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
25 | && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list \
26 | | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
27 | | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
28 |
29 | sudo apt-get update
30 | sudo apt-get install -y nvidia-container-toolkit
31 | sudo nvidia-ctk runtime configure --runtime=docker
32 | sudo systemctl restart docker
33 | ```
34 |
35 | ## Pull docker image and run
36 |
37 | Use the following command to pull our docker image.
38 |
39 | ```sh
40 | docker pull optimalscale/lmflow
41 | ```
42 |
43 | The working directory in docker is `/LMFlow`, where LMFlow (commit:
44 | [fa0e66f94](https://github.com/OptimalScale/LMFlow/tree/fa0e66f94eb5b7bfd624afdf9826b054641e3373))
45 | is cloned and installed. Use the following command to enter the docker
46 | container, where `/LMFlow/log/finetune` in the container will be mapped to
47 | `./output_dir/log/finetune` on the host machine. You may add more directory
48 | mappings in a similar manner.
49 |
50 | ```sh
51 | docker run \
52 | -v ./output_dir/log/finetune:/LMFlow/log/finetune \
53 | --gpus=all \
54 | --shm-size=64g \
55 | -e WANDB_DISABLED=true \
56 | -it \
57 | --rm \
58 | optimalscale/lmflow \
59 | bash
60 | ```
61 |
62 | Then you will be able to work inside the docker container, just like on a
63 | physical machine. Notice that to use multiple GPUs, you need to allocate
64 | enough shared memory. We have set up the dependencies for you, so you can
65 | directly run our scripts, e.g.
66 |
67 | ```sh
68 | ./scripts/run_chatbot.sh
69 | ./scripts/run_evaluation.sh
70 |
71 | # May need a GPU with --bf16 support, or you can remove --bf16
72 | # and use --fp16 instead
73 | ./scripts/run_finetune.sh
74 | ```
75 |
--------------------------------------------------------------------------------
/scripts/data_preprocess/merge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | Merges an extra dataset into current dataset.
6 | """
7 | from __future__ import absolute_import
8 |
9 | import argparse
10 | import json
11 | import textwrap
12 | import sys
13 |
14 | def parse_argument(sys_argv):
15 | """Parses arguments from command line.
16 | Args:
17 | sys_argv: the list of arguments (strings) from command line.
18 | Returns:
19 |         A namespace whose members correspond to the parsed (optional) arguments.
20 |         For example,
21 |         ```
22 |         args = parse_argument(['main.py', '--input', 'a.txt', '--num', '10'])
23 | args.input # 'a.txt'
24 | args.num # 10
25 | ```
26 | """
27 | parser = argparse.ArgumentParser(
28 | formatter_class=argparse.RawTextHelpFormatter)
29 |
30 | parser.add_argument(
31 | "--dataset_path", type=str,
32 | default=None,
33 | help=textwrap.dedent("input dataset path, reads from stdin by default")
34 | )
35 |     # Output and merge parameters
36 | parser.add_argument(
37 | "--output_path", type=str,
38 | default=None,
39 | help=textwrap.dedent("output dataset path, writes to stdout by default")
40 | )
41 | parser.add_argument(
42 | "--merge_from_path", type=str,
43 | nargs="+",
44 | help=textwrap.dedent(
45 | "dataset path of the extra dataset that will be merged"
46 | " into input dataset"
47 | )
48 | )
49 |
50 | # Parses from commandline
51 | args = parser.parse_args(sys_argv[1:])
52 |
53 | return args
54 |
55 |
56 | def main():
57 | args = parse_argument(sys.argv)
58 |
59 | if args.dataset_path is not None:
60 | with open(args.dataset_path, "r") as fin:
61 | data_dict = json.load(fin)
62 | else:
63 | data_dict = json.load(sys.stdin)
64 |
65 | if args.merge_from_path is not None:
66 | for i in range(0, len(args.merge_from_path)):
67 | with open(args.merge_from_path[i], "r") as fin:
68 | extra_data_dict = json.load(fin)
69 |
70 | if data_dict["type"] != extra_data_dict["type"]:
71 | raise ValueError(
72 | 'two dataset have different types:'
73 | f' input dataset: "{data_dict["type"]}";'
74 | f' merge from dataset: "{extra_data_dict["type"]}"'
75 | )
76 | data_dict["instances"].extend(extra_data_dict["instances"])
77 |
78 |
79 | if args.output_path is not None:
80 | with open(args.output_path, "w") as fout:
81 | json.dump(data_dict, fout, indent=4, ensure_ascii=False)
82 | else:
83 | json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)
84 |
85 |
86 | if __name__ == "__main__":
87 | main()
88 |
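For reference, a minimal sketch of how the merge behaves; the file name `extra.json` and the sample data are illustrative only, and the command assumes it is run from the repository root.

```python
import json
import subprocess

base = {"type": "text_only", "instances": [{"text": "A"}]}
extra = {"type": "text_only", "instances": [{"text": "B"}]}
with open("extra.json", "w") as fout:
    json.dump(extra, fout)

# Reads the base dataset from stdin and appends the instances from extra.json.
proc = subprocess.run(
    ["python", "scripts/data_preprocess/merge.py", "--merge_from_path", "extra.json"],
    input=json.dumps(base), capture_output=True, text=True, check=True,
)
print(proc.stdout)  # "instances" now contains {"text": "A"} followed by {"text": "B"}
```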
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | _build
8 |
9 | # C extensions
10 | *.so
11 |
12 | # tests and logs
13 | tests/fixtures/cached_*_text.txt
14 | logs/
15 | lightning_logs/
16 | lang_code_data/
17 | log/
18 | regression_test/*/new_output_models
19 | regression_test/*/new_log
20 | output_dir/
21 |
22 | # data files
23 | data/
24 |
25 | # output models
26 | output_models/
27 |
28 | # Distribution / packaging
29 | .Python
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | MANIFEST
46 |
47 | # PyInstaller
48 | # Usually these files are written by a python script from a template
49 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
50 | *.manifest
51 | *.spec
52 |
53 | # Installer logs
54 | pip-log.txt
55 | pip-delete-this-directory.txt
56 |
57 | # Unit test / coverage reports
58 | htmlcov/
59 | .tox/
60 | .nox/
61 | .coverage
62 | .coverage.*
63 | .cache
64 | nosetests.xml
65 | coverage.xml
66 | *.cover
67 | .hypothesis/
68 | .pytest_cache/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 | db.sqlite3
78 |
79 | # Flask stuff:
80 | instance/
81 | .webassets-cache
82 |
83 | # Scrapy stuff:
84 | .scrapy
85 |
86 | # Sphinx documentation
87 | docs/_build/
88 |
89 | # PyBuilder
90 | target/
91 |
92 | # Jupyter Notebook
93 | .ipynb_checkpoints
94 |
95 | # IPython
96 | profile_default/
97 | ipython_config.py
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # celery beat schedule file
103 | celerybeat-schedule
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # vscode
136 | .vs
137 | .vscode
138 |
139 | # Pycharm
140 | .idea
141 |
142 | # TF code
143 | tensorflow_code
144 |
145 | # Models
146 | proc_data
147 |
148 | # examples
149 | runs
150 | /runs_old
151 | /wandb
152 | /examples/runs
153 | /examples/**/*.args
154 | /examples/rag/sweep
155 |
156 | # data
157 | # /data
158 | serialization_dir
159 |
160 | # emacs
161 | *.*~
162 | debug.env
163 |
164 | # vim
165 | .*.swp
166 |
167 | #ctags
168 | tags
169 |
170 | # pre-commit
171 | .pre-commit*
172 |
173 | # .lock
174 | *.lock
175 |
176 | # DS_Store (MacOS)
177 | .DS_Store
178 |
179 | # ruff
180 | .ruff_cache
181 |
182 | # lm_evaluation cache
183 | lm_cache/
184 |
--------------------------------------------------------------------------------
/docs/source/_static/logo6.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/merge_tokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import argparse
5 | import logging
6 | import os
7 |
8 | from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
9 | import sentencepiece as spm
10 |
11 | import torch
12 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13 |
14 | from transformers import AutoTokenizer
15 |
16 | logging.basicConfig(level=logging.INFO)
17 |
18 | if __name__ == '__main__':
19 |
20 | os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
21 |
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('--tokenizer_dir', default='pinkmanlove/llama-7b-hf', type=str, required=False)
24 | parser.add_argument('--chinese_sp_model_file', default='./output_models/new_tokenizer/example.model', type=str)
25 | parser.add_argument('--output_dir', default='./output_models/merged_tokenizer', type=str, required=False)
26 | args = parser.parse_args()
27 |
28 | tokenizer_dir = args.tokenizer_dir
29 | chinese_sp_model_file = args.chinese_sp_model_file
30 | output_dir = args.output_dir
31 |
32 | # load
33 | old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
34 | chinese_sp_model = spm.SentencePieceProcessor()
35 | chinese_sp_model.Load(chinese_sp_model_file)
36 |
37 | old_spm = sp_pb2_model.ModelProto()
38 | old_spm.ParseFromString(old_tokenizer.sp_model.serialized_model_proto())
39 | chinese_spm = sp_pb2_model.ModelProto()
40 | chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())
41 |
42 | ## Add Chinese tokens to old tokenizer
43 | old_spm_tokens_set=set(p.piece for p in old_spm.pieces)
44 | for p in chinese_spm.pieces:
45 | piece = p.piece
46 | if piece not in old_spm_tokens_set:
47 | new_p = sp_pb2_model.ModelProto().SentencePiece()
48 | new_p.piece = piece
49 | new_p.score = 0
50 | old_spm.pieces.append(new_p)
51 |
52 | ## Save
53 | output_sp_dir = output_dir + '/merged_tokenizer_sp'
54 | output_hf_dir = output_dir + '/merged_tokenizer_hf' # the path to save tokenizer
55 | os.makedirs(output_sp_dir,exist_ok=True)
56 | with open(output_sp_dir+'/merged_tokenizer.model', 'wb') as f:
57 | f.write(old_spm.SerializeToString())
58 |
59 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,vocab_file=output_sp_dir+'/merged_tokenizer.model')
60 |
61 | tokenizer.save_pretrained(output_hf_dir)
62 | logging.info(f"Merged tokenizer has been saved to %s",output_dir)
63 |
64 |
65 | # Test
66 | old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
67 | new_tokenizer = AutoTokenizer.from_pretrained(output_hf_dir)
68 | logging.info(f"Old tokenizer vocab size: %d",len(old_tokenizer))
69 | logging.info(f"New tokenizer vocab size: %d",len(new_tokenizer))
70 |
71 | text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
72 | The primary use of LLaMA is research on large language models, including'''
73 | logging.info(f"Test text:\n %s",text)
74 | logging.info(f"Tokenized by LLaMA tokenizer:%s",old_tokenizer.tokenize(text))
75 | logging.info(f"Tokenized by Chinese-LLaMA tokenizer:%s",new_tokenizer.tokenize(text))
--------------------------------------------------------------------------------
/src/lmflow/pipeline/utils/peft_trainer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """Trainer for Peft models
4 | """
5 |
6 | from __future__ import absolute_import
7 | from transformers import Trainer
8 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
9 | from transformers.trainer_callback import (
10 | TrainerCallback,
11 | TrainerControl,
12 | TrainerState,
13 | )
14 | from transformers.training_args import TrainingArguments
15 | import os
16 | import numpy as np
17 |
18 | class PeftTrainer(Trainer):
19 | def _save_checkpoint(self, _, trial, metrics=None):
20 | """ Don't save base model, optimizer etc.
21 | but create checkpoint folder (needed for saving adapter) """
22 | checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
23 |
24 | run_dir = self._get_output_dir(trial=trial)
25 | output_dir = os.path.join(run_dir, checkpoint_folder)
26 |
27 | if metrics is not None and self.args.metric_for_best_model is not None:
28 | metric_to_check = self.args.metric_for_best_model
29 | if not metric_to_check.startswith("eval_"):
30 | metric_to_check = f"eval_{metric_to_check}"
31 | metric_value = metrics[metric_to_check]
32 |
33 | operator = np.greater if self.args.greater_is_better else np.less
34 | if (self.state.best_metric is None or self.state.best_model_checkpoint is None
35 | or operator(metric_value, self.state.best_metric)):
36 | self.state.best_metric = metric_value
37 |
38 | self.state.best_model_checkpoint = output_dir
39 |
40 | os.makedirs(output_dir, exist_ok=True)
41 |
42 | if self.args.should_save:
43 | self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
44 |
45 | class PeftSavingCallback(TrainerCallback):
46 | """ Correctly save PEFT model and not full model """
47 | def _save(self, model, folder):
48 | if folder is None:
49 | folder = ""
50 | peft_model_path = os.path.join(folder, "adapter_model")
51 | model.save_pretrained(peft_model_path)
52 |
53 | def on_train_end(self, args: TrainingArguments, state: TrainerState,
54 | control: TrainerControl, **kwargs):
55 | """ Save final best model adapter """
56 | self._save(kwargs['model'], state.best_model_checkpoint)
57 |
58 | def on_epoch_end(self, args: TrainingArguments, state: TrainerState,
59 | control: TrainerControl, **kwargs):
60 | """ Save intermediate model adapters in case of interrupted training """
61 | folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
62 | self._save(kwargs['model'], folder)
63 |
64 | def on_save(
65 | self,
66 | args: TrainingArguments,
67 | state: TrainerState,
68 | control: TrainerControl,
69 | **kwargs,
70 | ):
71 | checkpoint_folder = os.path.join(
72 | args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
73 | )
74 | self._save(kwargs['model'], checkpoint_folder)
75 |
76 | peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
77 | kwargs["model"].save_pretrained(peft_model_path)
78 | return control
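A minimal sketch of how these classes might be wired together; `peft_model` and `train_dataset` below are placeholders, not part of this module.

```python
from transformers import TrainingArguments
from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback

training_args = TrainingArguments(output_dir="output_models/peft_run", num_train_epochs=1)
trainer = PeftTrainer(
    model=peft_model,                   # a PEFT-wrapped model, e.g. from peft.get_peft_model(...)
    args=training_args,
    train_dataset=train_dataset,        # a tokenized dataset (placeholder)
    callbacks=[PeftSavingCallback()],   # saves only the adapter weights at each checkpoint
)
trainer.train()
```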
--------------------------------------------------------------------------------
/docs/source/examples/DATASETS.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 |
3 | We provide several available datasets under `data`. You may download them all by running:
4 | ```sh
5 | cd data && ./download.sh all && cd -
6 | ```
7 | You can replace `all` with a specific dataset name to only download that dataset (e.g. `./download.sh alpaca`).
8 |
9 | Customized datasets are strongly encouraged, since this way users can apply
10 | their own prompt engineering techniques over various source datasets. As long
11 | as the generated dataset follows the format below, it can be accepted as
12 | input to our pipelines :hugs:
13 |
14 |
15 | ## Dataset Format in General
16 |
17 | To specify the input for model finetuning, users can provide a list of `.json`
18 | files under a specified dataset directory. For example,
19 |
20 | ```sh
21 | |- path_to_dataset
22 | |- data_1.json
23 | |- data_2.json
24 | |- another_data.json
25 | |- ...
26 | ```
27 |
28 | For inference, we currently only support a single `.json` file.
29 |
30 | Each json file shall have the following format (three instances with four keys
31 | for example),
32 |
33 | ```json
34 | {
35 | "type": "TYPE",
36 | "instances": [
37 | {
38 | "KEY_1": "VALUE_1.1",
39 | "KEY_2": "VALUE_1.2",
40 | "KEY_3": "VALUE_1.3",
41 | "KEY_4": "VALUE_1.4",
42 | },
43 | {
44 | "KEY_1": "VALUE_2.1",
45 | "KEY_2": "VALUE_2.2",
46 | "KEY_3": "VALUE_2.3",
47 | "KEY_4": "VALUE_2.4",
48 | },
49 | {
50 | "KEY_1": "VALUE_3.1",
51 | "KEY_2": "VALUE_3.2",
52 | "KEY_3": "VALUE_3.3",
53 | "KEY_4": "VALUE_3.4",
54 | },
55 | ]
56 | }
57 | ```
58 |
59 | where `TYPE` indicates the dataset type and defines the set of keys
60 | `{ KEY_1, KEY_2, ... }` and their corresponding interpretations. The
61 | supported types are listed as follows.
62 |
63 | ## Supported Dataset and Detailed Formats
64 |
65 | ### TextOnly
66 |
67 | This is the most common dataset type, which only contains raw text in each
68 | sample. This type of dataset can be used as the training set for text decoder
69 | models, or as the input of decoder models / encoder-decoder models. Its format
70 | is as follows (three instances for example),
71 |
72 | ```json
73 | {
74 | "type": "text_only",
75 | "instances": [
76 | { "text": "SAMPLE_TEXT_1" },
77 | { "text": "SAMPLE_TEXT_2" },
78 | { "text": "SAMPLE_TEXT_3" },
79 | ]
80 | }
81 | ```
82 |
83 | For example, `data/example_dataset/train/train_50.json` has the above format.
84 |
85 | ### Text2Text
86 |
87 | This is the dataset type mostly used for inference, which contains a pair of
88 | texts in each sample. This type of dataset can be used as the training set for
89 | text encoder-decoder models, or as question-answer pairs for evaluating model
90 | inference. Its format is as follows (three instances for example),
91 |
92 | ```json
93 | {
94 | "type": "text2text",
95 | "instances": [
96 | {
97 | "input": "SAMPLE_INPUT_1",
98 | "output": "SAMPLE_OUTPUT_1",
99 | },
100 | {
101 | "input": "SAMPLE_INPUT_2",
102 | "output": "SAMPLE_OUTPUT_2",
103 | },
104 | {
105 | "input": "SAMPLE_INPUT_3",
106 | "output": "SAMPLE_OUTPUT_3",
107 | },
108 | ]
109 | }
110 | ```
111 |
112 | For example, `data/example_dataset/test/test_13.json` has the above format.
113 |
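As a minimal sketch, a dataset file in the `text_only` format above can also be produced programmatically; the directory and file names below are just examples.

```python
import json
import os

data = {
    "type": "text_only",
    "instances": [
        {"text": "SAMPLE_TEXT_1"},
        {"text": "SAMPLE_TEXT_2"},
    ],
}
os.makedirs("data/my_dataset/train", exist_ok=True)
with open("data/my_dataset/train/my_data.json", "w") as fout:
    json.dump(data, fout, indent=4, ensure_ascii=False)
```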
--------------------------------------------------------------------------------
/src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | import transformers.models.llama.modeling_llama
4 |
5 |
6 | class ScaledRotaryEmbedding(torch.nn.Module):
7 | def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
8 | super().__init__()
9 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
10 | self.register_buffer("inv_freq", inv_freq)
11 |
12 | self.scale = 4
13 | max_position_embeddings = max_position_embeddings * self.scale
14 |
15 | # Build here to make `torch.jit.trace` work.
16 | self.max_seq_len_cached = max_position_embeddings
17 | t = torch.arange(
18 | self.max_seq_len_cached,
19 | device=self.inv_freq.device,
20 | dtype=self.inv_freq.dtype,
21 | )
22 |
23 |
24 | t /= self.scale
25 |
26 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
27 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
28 | emb = torch.cat((freqs, freqs), dim=-1)
29 |
30 | #self.window = torch.hann_window(emb.shape[-1])
31 | self.register_buffer(
32 | "cos_cached", (emb.cos())[None, None, :, :], persistent=False
33 | )
34 | self.register_buffer(
35 | "sin_cached", (emb.sin())[None, None, :, :], persistent=False
36 | )
37 |
38 | def forward(self, x, seq_len=None):
39 | # x: [bs, num_attention_heads, seq_len, head_size]
40 | # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
41 | if seq_len > self.max_seq_len_cached:
42 | self.max_seq_len_cached = seq_len
43 | t = torch.arange(
44 | self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
45 | )
46 | t /= self.scale  # scale positions down, consistent with the interpolation in __init__
47 | freqs = torch.einsum("i,j->ij", t, self.inv_freq)
48 | # Different from paper, but it uses a different permutation in order to obtain the same calculation
49 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
50 | self.register_buffer(
51 | "cos_cached", emb.cos()[None, None, :, :], persistent=False
52 | )
53 | self.register_buffer(
54 | "sin_cached", emb.sin()[None, None, :, :], persistent=False
55 | )
56 | return (
57 | self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
58 | self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
59 | )
60 |
61 | old_init = transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.__init__
62 |
63 | def ntk_scaled_init(self, dim, max_position_embeddings=2048, base=10000, device=None):
64 | #The method is just these three lines
65 | a = 8 #Alpha value
66 | max_position_embeddings = max_position_embeddings * a
67 |
68 | base = base * a ** (dim / (dim-2)) #Base change formula
69 |
70 | old_init(self, dim, max_position_embeddings, base, device)
71 |
72 | def replace_llama_rope_with_scaled_rope():
73 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = (
74 | ScaledRotaryEmbedding
75 | )
76 |
77 | def repalce_llama_rope_init_with_scaled_rope_init():
78 | transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.__init__ = ntk_scaled_init
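A minimal usage sketch, assuming the patch is applied before the LLaMA model is constructed; the checkpoint name is a placeholder. Note that `ScaledRotaryEmbedding` hard-codes a scale factor of 4.

```python
from transformers import AutoModelForCausalLM
from lmflow.utils.position_interpolation.llama_rope_scaled_monkey_patch import (
    replace_llama_rope_with_scaled_rope,
)

# Swap in the scaled rotary embedding first, then build the model so it picks up the patched class.
replace_llama_rope_with_scaled_rope()
model = AutoModelForCausalLM.from_pretrained("pinkmanlove/llama-7b-hf")
```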
--------------------------------------------------------------------------------
/experimental/RAFT-diffusion/README.md:
--------------------------------------------------------------------------------
1 | # RAFT-Diffusion
2 |
3 |
4 | In this folder, we provide an example that shows how RAFT works on diffusion models. We will also integrate these scripts into the LMFlow APIs in the future.
5 |
6 |
7 | The requirements are shown below.
8 | ```
9 | accelerate 0.18.0
10 | asttokens 2.2.1
11 | backcall 0.2.0
12 | bitsandbytes 0.37.2
13 | certifi 2022.12.7
14 | charset-normalizer 3.1.0
15 | clip 1.0
16 | cmake 3.26.1
17 | comm 0.1.3
18 | contourpy 1.0.7
19 | cycler 0.11.0
20 | debugpy 1.6.7
21 | decorator 5.1.1
22 | diffusers 0.14.0
23 | executing 1.2.0
24 | filelock 3.11.0
25 | fonttools 4.39.3
26 | ftfy 6.1.1
27 | huggingface-hub 0.13.4
28 | idna 3.4
29 | importlib-metadata 6.2.0
30 | importlib-resources 5.12.0
31 | ipykernel 6.22.0
32 | ipython 8.12.0
33 | jedi 0.18.2
34 | Jinja2 3.1.2
35 | jupyter_client 8.1.0
36 | jupyter_core 5.3.0
37 | kiwisolver 1.4.4
38 | lit 16.0.0
39 | MarkupSafe 2.1.2
40 | matplotlib 3.7.1
41 | matplotlib-inline 0.1.6
42 | mpmath 1.3.0
43 | mypy-extensions 1.0.0
44 | nest-asyncio 1.5.6
45 | networkx 3.1
46 | numpy 1.24.2
47 | nvidia-cublas-cu11 11.10.3.66
48 | nvidia-cuda-cupti-cu11 11.7.101
49 | nvidia-cuda-nvrtc-cu11 11.7.99
50 | nvidia-cuda-runtime-cu11 11.7.99
51 | nvidia-cudnn-cu11 8.5.0.96
52 | nvidia-cufft-cu11 10.9.0.58
53 | nvidia-curand-cu11 10.2.10.91
54 | nvidia-cusolver-cu11 11.4.0.1
55 | nvidia-cusparse-cu11 11.7.4.91
56 | nvidia-nccl-cu11 2.14.3
57 | nvidia-nvtx-cu11 11.7.91
58 | open-clip-torch 2.16.0
59 | packaging 23.0
60 | pandas 2.0.0
61 | parso 0.8.3
62 | pexpect 4.8.0
63 | pickleshare 0.7.5
64 | Pillow 9.5.0
65 | pip 23.0.1
66 | platformdirs 3.2.0
67 | prompt-toolkit 3.0.38
68 | protobuf 3.20.3
69 | psutil 5.9.4
70 | ptyprocess 0.7.0
71 | pure-eval 0.2.2
72 | Pygments 2.14.0
73 | pyparsing 3.0.9
74 | pyre-extensions 0.0.23
75 | python-dateutil 2.8.2
76 | pytz 2023.3
77 | PyYAML 6.0
78 | pyzmq 25.0.2
79 | regex 2023.3.23
80 | requests 2.28.2
81 | sentencepiece 0.1.97
82 | setuptools 65.6.3
83 | six 1.16.0
84 | stack-data 0.6.2
85 | sympy 1.11.1
86 | timm 0.6.13
87 | tokenizers 0.13.3
88 | torch 2.0.0
89 | torchvision 0.15.1
90 | tornado 6.2
91 | tqdm 4.65.0
92 | traitlets 5.9.0
93 | transformers 4.27.4
94 | triton 2.0.0
95 | typing_extensions 4.5.0
96 | typing-inspect 0.8.0
97 | tzdata 2023.3
98 | urllib3 1.26.15
99 | wcwidth 0.2.6
100 | wheel 0.38.4
101 | xformers 0.0.18
102 | zipp 3.15.0
103 | ```
104 |
105 | We will also add a COLAB link for convenience.
106 |
--------------------------------------------------------------------------------
/utils/lm_evaluator.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import fnmatch
5 |
6 | from lm_eval import tasks, evaluator
7 |
8 | logging.getLogger("openai").setLevel(logging.WARNING)
9 |
10 |
11 | class MultiChoice:
12 | def __init__(self, choices):
13 | self.choices = choices
14 |
15 | # Simple wildcard support (linux filename patterns)
16 | def __contains__(self, values):
17 | for value in values.split(","):
18 | if len(fnmatch.filter(self.choices, value)) == 0:
19 | return False
20 |
21 | return True
22 |
23 | def __iter__(self):
24 | for choice in self.choices:
25 | yield choice
26 |
27 |
28 | def parse_args():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--model", required=True)
31 | parser.add_argument("--model_args", default="")
32 | parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
33 | parser.add_argument("--provide_description", action="store_true")
34 | parser.add_argument("--num_fewshot", type=int, default=0)
35 | parser.add_argument("--batch_size", type=int, default=None)
36 | parser.add_argument("--device", type=str, default=None)
37 | parser.add_argument("--output_path", default=None)
38 | parser.add_argument("--limit", type=int, default=None)
39 | parser.add_argument("--no_cache", action="store_true")
40 | parser.add_argument("--decontamination_ngrams_path", default=None)
41 | parser.add_argument("--description_dict_path", default=None)
42 | parser.add_argument("--check_integrity", action="store_true")
43 |
44 | return parser.parse_args()
45 |
46 |
47 | # Returns a list containing all values of the source_list that
48 | # match at least one of the patterns
49 | def pattern_match(patterns, source_list):
50 | task_names = set()
51 | for pattern in patterns:
52 | for matching in fnmatch.filter(source_list, pattern):
53 | task_names.add(matching)
54 | return list(task_names)
55 |
56 |
57 | def main():
58 | args = parse_args()
59 |
60 | assert not args.provide_description # not implemented
61 |
62 | if args.limit:
63 | print(
64 | "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
65 | )
66 |
67 | if args.tasks is None:
68 | task_names = tasks.ALL_TASKS
69 | else:
70 | task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
71 |
72 | print(f"Selected Tasks: {task_names}")
73 |
74 | description_dict = {}
75 | if args.description_dict_path:
76 | with open(args.description_dict_path, "r") as f:
77 | description_dict = json.load(f)
78 |
79 | results = evaluator.simple_evaluate(
80 | model=args.model,
81 | model_args=args.model_args,
82 | tasks=task_names,
83 | num_fewshot=args.num_fewshot,
84 | batch_size=args.batch_size,
85 | device=args.device,
86 | no_cache=args.no_cache,
87 | limit=args.limit,
88 | description_dict=description_dict,
89 | decontamination_ngrams_path=args.decontamination_ngrams_path,
90 | check_integrity=args.check_integrity,
91 | )
92 |
93 | dumped = json.dumps(results, indent=2)
94 | print(dumped)
95 |
96 | if args.output_path:
97 | with open(args.output_path, "w") as f:
98 | f.write(dumped)
99 |
100 | print(
101 | f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
102 | f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
103 | )
104 | print(evaluator.make_table(results))
105 |
106 |
107 | if __name__ == "__main__":
108 | main()
109 |
--------------------------------------------------------------------------------
/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import torch
4 | import transformers
5 | from einops import rearrange
6 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
7 | from flash_attn.bert_padding import unpad_input, pad_input
8 |
9 | def _attn(self, query, key, value, attention_mask=None, head_mask=None):
10 | # (batch, head, seq_length, head_features)
11 | query = query.to(torch.bfloat16)
12 | key = key.to(torch.bfloat16)
13 | query = query * torch.sqrt(torch.tensor(self.head_dim))
14 | qkv = torch.stack(
15 | [query, key, value], dim=2
16 | )# [bsz, nh, 3, t, hd]
17 | qkv = qkv.transpose(1,3)## [bsz, q_len, 3, nh, hd]
18 | bsz = qkv.shape[0]
19 | q_len = qkv.shape[1]
20 |
21 | # Convert the additive mask to a boolean key padding mask (True = keep); handle the case of no mask
22 | key_padding_mask = rearrange(torch.where(attention_mask == -0.0, True, False), "b () () s -> b s") if attention_mask is not None else None
23 | if key_padding_mask is None:
24 | qkv = rearrange(qkv, "b s ... -> (b s) ...")
25 | max_s = q_len
26 | cu_q_lens = torch.arange(
27 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
28 | )
29 | output = flash_attn_unpadded_qkvpacked_func(
30 | qkv, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0 , softmax_scale=None, causal=True
31 | )# attention compute
32 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
33 | else:
34 | nheads = qkv.shape[-2]
35 | x = rearrange(qkv, "b s three h d -> b s (three h d)")
36 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
37 | x_unpad = rearrange(
38 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
39 | )
40 | output_unpad = flash_attn_unpadded_qkvpacked_func(
41 | x_unpad, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0, softmax_scale=None, causal=True
42 | )
43 | output = rearrange(
44 | pad_input(
45 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
46 | ),
47 | "b s (h d) -> b s h d",
48 | h=nheads,
49 | )
50 |
51 | return output, None
52 |
53 | def forward(
54 | self,
55 | hidden_states,
56 | attention_mask=None,
57 | layer_past=None,
58 | head_mask=None,
59 | use_cache=False,
60 | output_attentions=False,
61 | ):
62 |
63 | assert head_mask is None, "head_mask is not supported"
64 | assert not output_attentions, "output_attentions is not supported"
65 | assert not use_cache, "use_cache is not supported"
66 |
67 | query = self.q_proj(hidden_states)
68 | key = self.k_proj(hidden_states)
69 | value = self.v_proj(hidden_states)
70 |
71 | query = self._split_heads(query, self.num_heads, self.head_dim)
72 | key = self._split_heads(key, self.num_heads, self.head_dim)
73 | value = self._split_heads(value, self.num_heads, self.head_dim)
74 |
75 | if layer_past is not None:
76 | past_key = layer_past[0]
77 | past_value = layer_past[1]
78 | key = torch.cat((past_key, key), dim=-2)
79 | value = torch.cat((past_value, value), dim=-2)
80 |
81 | present = None
82 | attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
83 | new_shape = attn_output.size()[:-2] + (self.num_heads * self.head_dim,)
84 | attn_output = attn_output.view(new_shape)
85 | attn_output = self.out_proj(attn_output)
86 | attn_output = self.resid_dropout(attn_output)
87 |
88 | outputs = (attn_output, present)
89 |
90 | return outputs # a, present, (attentions)
91 |
92 | def replace_gpt_neo_attn_with_flash_attn():
93 | transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._attn = _attn
94 | transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention.forward = forward
--------------------------------------------------------------------------------
/docs/source/_static/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/examples/TASK_GUIDE.md:
--------------------------------------------------------------------------------
1 | # LMFlow Benchmark Guide
2 |
3 | We support two ways to add evaluation settings in our repo, `NLL Task Setting` and `LM-Evaluation Task Setting`. Below are the details of them:
4 |
5 | # 1. NLL Task Setting
6 | Users can easily create new tasks and evaluate their datasets with
7 | the provided `nll` (negative log likelihood) metric.
8 |
9 | ## Setup
10 |
11 | Fork the main repo, clone it, create a new branch with the name of
12 | your task, and install the following:
13 |
14 | ```bash
15 | # After forking...
16 | git clone https://github.com/<your_username>/LMFlow.git
17 | cd LMFlow
18 | git checkout -b <your_task>
19 | conda create -n lmflow python=3.9 -y
20 | conda activate lmflow
21 | conda install mpi4py
22 | pip install -e .
23 | ```
24 | ## Create Your Task Dataset File
25 | We provide several available datasets under `data` after running
26 | ```sh
27 | cd data && ./download.sh && cd -
28 | ```
29 |
30 | You can refer to some given evaluation dataset files and create your own.
31 | Also, you may refer to our guide on
32 | [DATASET](https://optimalscale.github.io/LMFlow/examples/DATASETS.html).
33 |
34 | In this step, you will need to decide your answer type like `text2text`
35 | or `text_only` (Notice that the current `nll` implementation only supports these
36 | two answer types). We will note the chosen answer type as `<ANSWER_TYPE>`.
37 |
38 | After preparing your own `DATASET` file, you can put it under `data` dir
39 | and make a `TASK` dir.
40 |
41 | ```bash
42 | mkdir data/<TASK>
43 | mv <DATASET> data/<TASK>/
44 | ```
45 |
46 | ## Task Registration
47 |
48 | Note the path of your dataset, `data/<TASK>/<DATASET>`.
49 |
50 | Open the file `examples/benchmarking.py`, add your task's info into
51 | `LOCAL_DATSET_GROUP_MAP`, `LOCAL_DATSET_MAP`, `LOCAL_DATSET_ANSWERTYPE_MAP`
52 |
53 | In `LOCAL_DATSET_MAP`, you will need to specify your `DATASET` files' path:
54 |
55 | ```python
56 | LOCAL_DATSET_MAP ={
57 | "...":"...",
58 | "":"data//",
59 | }
60 | ```
61 |
62 | In `LOCAL_DATSET_ANSWERTYPE_MAP`, you will need to specify your task's
63 | `<ANSWER_TYPE>`:
64 |
65 | ```python
66 | LOCAL_DATSET_ANSWERTYPE_MAP ={
67 | "...":"...",
68 | "":",
69 | }
70 | ```
71 |
72 | If you only have one task, you can add a key-value pair like `"<TASK>": "<TASK>"`
73 | in `LOCAL_DATSET_GROUP_MAP`:
74 | ```python
75 | LOCAL_DATSET_GROUP_MAP ={
76 | "...":"...",
77 | "":"",
78 | }
79 | ```
80 |
81 |
82 | If you want to combine several tasks, you may first specify a
83 | combination name `<TASK_GROUP>` and add a key-value pair like
84 | `"<TASK_GROUP>": "<TASK_1>,<TASK_2>,..."` in `LOCAL_DATSET_GROUP_MAP`.
85 |
86 | Remember to separate TASK by `,`:
87 | ```python
88 | LOCAL_DATSET_GROUP_MAP ={
89 | "...":"...",
90 | "":",,..",
91 | }
92 | ```
93 |
94 | After changing these items, you can run your own `<TASK>` like:
95 |
96 | ```bash
97 | deepspeed examples/benchmarking.py \
98 |     --answer_type <ANSWER_TYPE> \
99 | --use_ram_optimized_load False \
100 | --model_name_or_path ${model_name} \
101 |     --dataset_name data/<TASK>/<DATASET> \
102 | --deepspeed examples/ds_config.json \
103 | --metric nll \
104 | --prompt_structure "###Human: {input}###Assistant:" \
105 | | tee ${log_dir}/train.log \
106 | 2> ${log_dir}/train.err
107 | ```
108 |
109 | # 2. LM-Evaluation Task Setting
110 |
111 | We integrate [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) into
112 | `benchmarking.py` by directly executing the evaluation commands. Users
113 | can also run their own evaluation by simply changing two items in
114 | `LM_EVAL_DATASET_MAP` of `examples/benchmarking.py`.
115 |
116 | Please refer to Eleuther's
117 | [task-table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)
118 | to get the exact `<TASK>` name.
119 |
120 | Similarly, if you want to combine several tasks, you may first specify a
121 | combination name `<TASK_GROUP>` and add a key-value pair like
122 | `"<TASK_GROUP>": "<TASK_1>,<TASK_2>,..."` in `LM_EVAL_DATASET_MAP`.
123 |
124 | Also, remember to separate TASK by `,`:
125 |
126 | ```python
127 | LM_EVAL_DATASET_MAP ={
128 | "...":"...",
129 | "":",,..",
130 | }
131 | ```
132 |
133 |
--------------------------------------------------------------------------------
/src/lmflow/utils/flash_attention/bloom_flash_attention.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple, Union
2 |
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 |
7 | import transformers
8 | from transformers.models.bloom.modeling_bloom import dropout_add
9 |
10 | from einops import rearrange
11 |
12 | from .triton_flash_attention import flash_attn_qkvpacked_func
13 |
14 | def forward(
15 | self,
16 | hidden_states: torch.Tensor,
17 | residual: torch.Tensor,
18 | alibi: torch.Tensor,
19 | attention_mask: torch.Tensor,
20 | layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
21 | head_mask: Optional[torch.Tensor] = None,
22 | use_cache: bool = False,
23 | output_attentions: bool = False,
24 | ):
25 | dtype = hidden_states.dtype
26 | fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
27 |
28 | # 3 x [batch_size, seq_length, num_heads, head_dim]
29 | (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
30 |
31 | batch_size, q_length, _, _ = query_layer.shape
32 | bsz, q_len = batch_size, q_length
33 |
34 | if layer_past is not None:
35 | past_key, past_value = layer_past
36 | # concatenate along seq_length dimension:
37 | # - key: [batch_size * self.num_heads, head_dim, kv_length]
38 | # - value: [batch_size * self.num_heads, kv_length, head_dim]
39 | key_layer = torch.cat((past_key, key_layer), dim=2)
40 | value_layer = torch.cat((past_value, value_layer), dim=1)
41 |
42 | if use_cache is True:
43 | present = (key_layer, value_layer)
44 | else:
45 | present = None
46 |
47 | reshaped_alibi = rearrange(alibi, '(b h) one s-> b h one s', h = self.num_heads)
48 | reshaped_alibi = reshaped_alibi * self.beta
49 |
50 | attention_mask = (1.0 - attention_mask)
51 | attention_mask = attention_mask[:, None, None, :].bool()
52 | reshaped_alibi_masked = reshaped_alibi.masked_fill(attention_mask, -1e9)
53 |
54 | reshaped_query_layer = query_layer
55 | reshaped_key_layer = key_layer
56 | reshaped_value_layer = value_layer
57 |
58 | qkv = torch.concat([reshaped_query_layer.unsqueeze(2), reshaped_key_layer.unsqueeze(2), reshaped_value_layer.unsqueeze(2)], dim = 2)
59 |
60 | output = flash_attn_qkvpacked_func(
61 | qkv, reshaped_alibi_masked, True, self.inv_norm_factor
62 | )
63 |
64 | output = rearrange(output, 'b s h d -> (b h) s d')
65 |
66 | # change view [batch_size, num_heads, q_length, head_dim]
67 | context_layer = self._merge_heads(output)
68 |
69 | # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
70 | if self.pretraining_tp > 1 and self.slow_but_exact:
71 | slices = self.hidden_size / self.pretraining_tp
72 | output_tensor = torch.zeros_like(context_layer)
73 | for i in range(self.pretraining_tp):
74 | output_tensor = output_tensor + F.linear(
75 | context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
76 | self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
77 | )
78 | else:
79 | output_tensor = self.dense(context_layer)
80 |
81 | output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)
82 |
83 | outputs = (output_tensor, present)
84 | if output_attentions:
85 | outputs += (context_layer,)
86 |
87 | return outputs
88 |
89 |
90 | # Disable the transformation of the attention mask in LlamaModel as the flash attention
91 | # requires the attention mask to be the same as the key_padding_mask
92 | def _prepare_attn_mask(
93 | self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int
94 | ) -> torch.BoolTensor:
95 |
96 | return attention_mask
97 |
98 | def replace_bloom_attn_with_flash_attn():
99 | transformers.models.bloom.modeling_bloom.BloomModel._prepare_attn_mask = (
100 | _prepare_attn_mask
101 | )
102 | transformers.models.bloom.modeling_bloom.BloomAttention.forward = forward
--------------------------------------------------------------------------------
/output_models/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function main() {
4 | public_server="http://lmflow.org:5000"
5 | if [ $# -lt 1 -o "$1" = "-h" -o "$1" = "--help" ]; then
6 | echo "Usage: bash $(basename $0) model_name"
7 | echo "Example: bash $(basename $0) instruction_ckpt"
8 | echo "Example: bash $(basename $0) all"
9 | fi
10 |
11 | if [ "$1" = "llama7b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then
12 | echo "downloading llama7b-lora-medical.tar.gz"
13 | filename='llama7b-lora-medical.tar.gz'
14 | wget ${public_server}/${filename}
15 | tar zxvf ${filename}
16 | rm ${filename}
17 | fi
18 |
19 | if [ "$1" = "llama13b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then
20 | echo "downloading llama13b-lora-medical.tar.gz"
21 | filename='llama13b-lora-medical.tar.gz'
22 | wget ${public_server}/${filename}
23 | tar zxvf ${filename}
24 | rm ${filename}
25 | fi
26 |
27 | if [ "$1" = "llama30b-lora-medical" -o "$1" = "medical_ckpt" -o "$1" = "all" ]; then
28 | echo "downloading llama30b-lora-medical.tar.gz"
29 | filename='llama30b-lora-medical.tar.gz'
30 | wget ${public_server}/${filename}
31 | tar zxvf ${filename}
32 | rm ${filename}
33 | fi
34 |
35 | if [ "$1" = "llama7b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then
36 | echo "downloading llama7b-lora-170k.tar.gz"
37 | filename='llama7b-lora-170k.tar.gz'
38 | wget ${public_server}/${filename}
39 | tar zxvf ${filename}
40 | rm ${filename}
41 | fi
42 |
43 | if [ "$1" = "llama7b-lora-380k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then
44 | echo "downloading llama7b-lora-380k.tar.gz"
45 | filename='llama7b-lora-380k.tar.gz'
46 | wget ${public_server}/${filename}
47 | tar zxvf ${filename}
48 | rm ${filename}
49 | fi
50 |
51 | if [ "$1" = "llama13b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then
52 | echo "downloading llama13b-lora-170k.tar.gz"
53 | filename='llama13b-lora-170k.tar.gz'
54 | wget ${public_server}/${filename}
55 | tar zxvf ${filename}
56 | rm ${filename}
57 | fi
58 |
59 | if [ "$1" = "llama13b-lora-380k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then
60 | echo "downloading llama13b-lora-380k.tar.gz"
61 | filename='llama13b-lora-380k.tar.gz'
62 | wget ${public_server}/${filename}
63 | tar zxvf ${filename}
64 | rm ${filename}
65 | fi
66 |
67 | if [ "$1" = "llama30b-lora-170k" -o "$1" = "instruction_ckpt" -o "$1" = "all" ]; then
68 | echo "downloading llama30b-lora-170k.tar.gz"
69 | filename='llama30b-lora-170k.tar.gz'
70 | wget ${public_server}/${filename}
71 | tar zxvf ${filename}
72 | rm ${filename}
73 | fi
74 |
75 | if [ "$1" = "llama7b-lora-movie-reviewer" -o "$1" = "raft_ckpt" -o "$1" = "all" ]; then
76 | echo "downloading llama7b-lora-movie-reviewer"
77 | filename='llama7b-lora-movie-reviewer.tar.gz'
78 | wget ${public_server}/${filename}
79 | tar zxvf ${filename}
80 | rm ${filename}
81 | fi
82 |
83 | if [ "$1" = "cockatoo-7b" -o "$1" = "all" ]; then
84 | echo "downloading cockatoo-7b"
85 | filename='cockatoo-7b.tar.gz'
86 | wget ${public_server}/${filename}
87 | tar zxvf ${filename}
88 | rm ${filename}
89 | fi
90 |
91 | if [ "$1" = "parakeets-2.7b" -o "$1" = "all" ]; then
92 | echo "downloading parakeets-2.7b"
93 | filename='parakeets-2.7b.tar.gz'
94 | wget ${public_server}/${filename}
95 | tar zxvf ${filename}
96 | rm ${filename}
97 | fi
98 |
99 | if [ "$1" = "robin-7b" -o "$1" = "all" ]; then
100 | echo "downloading robin-7b"
101 | filename='robin-7b-v2-delta.tar.gz'
102 | wget ${public_server}/${filename}
103 | tar zxvf ${filename}
104 | rm ${filename}
105 | fi
106 |
107 | if [ "$1" = "minigpt4_7b" -o "$1" = "all" ]; then
108 | echo "downloading minigpt4_7b"
109 | filename='pretrained_minigpt4_7b.pth'
110 | wget ${public_server}/${filename}
111 | fi
112 |
113 | if [ "$1" = "minigpt4_13b" -o "$1" = "all" ]; then
114 | echo "downloading minigpt4_13b"
115 | filename='pretrained_minigpt4_13b.pth'
116 | wget ${public_server}/${filename}
117 | fi
118 | }
119 |
120 | main "$@"
121 |
--------------------------------------------------------------------------------
/src/lmflow/utils/flash_attention/llama_flash_attention.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import torch
4 | from torch import nn
5 |
6 | import transformers
7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
8 |
9 | from einops import rearrange
10 |
11 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
12 | from flash_attn.bert_padding import unpad_input, pad_input
13 |
14 |
15 | def forward(
16 | self,
17 | hidden_states: torch.Tensor,
18 | attention_mask: Optional[torch.Tensor] = None,
19 | position_ids: Optional[torch.Tensor] = None,
20 | past_key_value: Optional[Tuple[torch.Tensor]] = None,
21 | output_attentions: bool = False,
22 | use_cache: bool = False,
23 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
24 | """Input shape: Batch x Time x Channel
25 |
26 | attention_mask: [bsz, q_len]
27 | """
28 | bsz, q_len, _ = hidden_states.size()
29 |
30 | query_states = (
31 | self.q_proj(hidden_states)
32 | .view(bsz, q_len, self.num_heads, self.head_dim)
33 | .transpose(1, 2)
34 | )
35 | key_states = (
36 | self.k_proj(hidden_states)
37 | .view(bsz, q_len, self.num_heads, self.head_dim)
38 | .transpose(1, 2)
39 | )
40 | value_states = (
41 | self.v_proj(hidden_states)
42 | .view(bsz, q_len, self.num_heads, self.head_dim)
43 | .transpose(1, 2)
44 | )
45 | # [bsz, q_len, nh, hd]
46 | # [bsz, nh, q_len, hd]
47 |
48 | kv_seq_len = key_states.shape[-2]
49 | assert past_key_value is None, "past_key_value is not supported"
50 |
51 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
52 | query_states, key_states = apply_rotary_pos_emb(
53 | query_states, key_states, cos, sin, position_ids
54 | )
55 | # [bsz, nh, t, hd]
56 | assert not output_attentions, "output_attentions is not supported"
57 | assert not use_cache, "use_cache is not supported"
58 |
59 | # Flash attention codes from
60 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
61 |
62 | # transform the data into the format required by flash attention
63 | qkv = torch.stack(
64 | [query_states, key_states, value_states], dim=2
65 | ) # [bsz, nh, 3, q_len, hd]
66 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
67 | # We have disabled _prepare_decoder_attention_mask in LlamaModel
68 | # the attention_mask should be the same as the key_padding_mask
69 | key_padding_mask = attention_mask
70 |
71 | if key_padding_mask is None:
72 | qkv = rearrange(qkv, "b s ... -> (b s) ...")
73 | max_s = q_len
74 | cu_q_lens = torch.arange(
75 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
76 | )
77 | output = flash_attn_unpadded_qkvpacked_func(
78 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
79 | )
80 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
81 | else:
82 | nheads = qkv.shape[-2]
83 | x = rearrange(qkv, "b s three h d -> b s (three h d)")
84 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
85 | x_unpad = rearrange(
86 | x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
87 | )
88 | output_unpad = flash_attn_unpadded_qkvpacked_func(
89 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
90 | )
91 | output = rearrange(
92 | pad_input(
93 | rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
94 | ),
95 | "b s (h d) -> b s h d",
96 | h=nheads,
97 | )
98 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None
99 |
100 |
101 | # Disable the transformation of the attention mask in LlamaModel as the flash attention
102 | # requires the attention mask to be the same as the key_padding_mask
103 | def _prepare_decoder_attention_mask(
104 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length
105 | ):
106 | # [bsz, seq_len]
107 | return attention_mask
108 |
109 |
110 | def replace_llama_attn_with_flash_attn():
111 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
112 | _prepare_decoder_attention_mask
113 | )
114 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
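A minimal usage sketch: the patch has to be applied before the model is created, and the checkpoint name is a placeholder. `bloom_flash_attention.py` and `gpt_neo_flash_attention.py` in the same directory expose analogous `replace_*` helpers.

```python
import torch
from transformers import AutoModelForCausalLM
from lmflow.utils.flash_attention.llama_flash_attention import replace_llama_attn_with_flash_attn

replace_llama_attn_with_flash_attn()
model = AutoModelForCausalLM.from_pretrained(
    "pinkmanlove/llama-7b-hf",
    torch_dtype=torch.float16,  # the flash attention kernels expect fp16/bf16 activations
).cuda()
```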
--------------------------------------------------------------------------------
/docs/source/_static/logo2.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/lmflow/utils/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """
4 | Commonly used constants.
5 | """
6 |
7 | TEXT_ONLY_DATASET_DESCRIPTION = (
8 | """
9 | "text_only": a dataset with only raw text instances, with following format:
10 |
11 | {
12 | "type": "text_only",
13 | "instances": [
14 | { "text": "TEXT_1" },
15 | { "text": "TEXT_2" },
16 | ...
17 | ]
18 | }
19 | """
20 | ).lstrip("\n")
21 |
22 |
23 | TEXT_ONLY_DATASET_DETAILS = (
24 | """
25 | For example,
26 |
27 | ```python
28 | from lmflow.datasets import Dataset
29 |
30 | data_dict = {
31 | "type": "text_only",
32 | "instances": [
33 | { "text": "Human: Hello. Bot: Hi!" },
34 | { "text": "Human: How are you today? Bot: Fine, thank you!" },
35 | ]
36 | }
37 | dataset = Dataset.create_from_dict(data_dict)
38 | ```
39 |
40 | You may also save the corresponding format to json,
41 | ```python
42 | import json
43 | from lmflow.args import DatasetArguments
44 | from lmflow.datasets import Dataset
45 |
46 | data_dict = {
47 | "type": "text_only",
48 | "instances": [
49 | { "text": "Human: Hello. Bot: Hi!" },
50 | { "text": "Human: How are you today? Bot: Fine, thank you!" },
51 | ]
52 | }
53 | with open("data.json", "w") as fout:
54 | json.dump(data_dict, fout)
55 |
56 | data_args = DatasetArguments(dataset_path="data.json")
57 | dataset = Dataset(data_args)
58 | new_data_dict = dataset.to_dict()
59 | # `new_data_dict` should have the same content as `data_dict`
60 | ```
61 | """
62 | ).lstrip("\n")
63 |
64 |
65 | TEXT2TEXT_DATASET_DESCRIPTION = (
66 | """
67 | "text2text": a dataset with input & output instances, with following format:
68 |
69 | {
70 | "type": "text2text",
71 | "instances": [
72 | { "input": "INPUT_1", "output": "OUTPUT_1" },
73 | { "input": "INPUT_2", "output": "OUTPUT_2" },
74 | ...
75 | ]
76 | }
77 | """
78 | ).lstrip("\n")
79 |
80 |
81 | TEXT2TEXT_DATASET_DETAILS = (
82 | """
83 | For example,
84 |
85 | ```python
86 | from lmflow.datasets import Dataset
87 |
88 | data_dict = {
89 | "type": "text2text",
90 | "instances": [
91 | {
92 | "input": "Human: Hello.",
93 | "output": "Bot: Hi!",
94 | },
95 | {
96 | "input": "Human: How are you today?",
97 | "output": "Bot: Fine, thank you! And you?",
98 | }
99 | ]
100 | }
101 | dataset = Dataset.create_from_dict(data_dict)
102 | ```
103 |
104 | You may also save the corresponding format to json,
105 | ```python
106 | import json
107 | from lmflow.args import DatasetArguments
108 | from lmflow.datasets import Dataset
109 |
110 | data_dict = {
111 | "type": "text2text",
112 | "instances": [
113 | {
114 | "input": "Human: Hello.",
115 | "output": "Bot: Hi!",
116 | },
117 | {
118 | "input": "Human: How are you today?",
119 | "output": "Bot: Fine, thank you! And you?",
120 | }
121 | ]
122 | }
123 | with open("data.json", "w") as fout:
124 | json.dump(data_dict, fout)
125 |
126 | data_args = DatasetArguments(dataset_path="data.json")
127 | dataset = Dataset(data_args)
128 | new_data_dict = dataset.to_dict()
129 | # `new_data_dict` should have the same content as `data_dict`
130 | ```
131 | """
132 | ).lstrip("\n")
133 |
134 |
135 | FLOAT_ONLY_DATASET_DESCRIPTION = (
136 | """
137 | "float_only": a dataset with only float instances, with following format:
138 |
139 | {
140 | "type": "float_only",
141 | "instances": [
142 | { "value": "FLOAT_1" },
143 | { "value": "FLOAT_2" },
144 | ...
145 | ]
146 | }
147 | """
148 | ).lstrip("\n")
149 |
150 |
151 | TEXT_ONLY_DATASET_LONG_DESCRITION = (
152 | TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
153 | )
154 |
155 | TEXT2TEXT_DATASET_LONG_DESCRITION = (
156 | TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
157 | )
158 |
159 |
160 | DATASET_DESCRIPTION_MAP = {
161 | "text_only": TEXT_ONLY_DATASET_DESCRIPTION,
162 | "text2text": TEXT2TEXT_DATASET_DESCRIPTION,
163 | "float_only": FLOAT_ONLY_DATASET_DESCRIPTION,
164 | }
165 |
166 | INSTANCE_FIELDS_MAP = {
167 | "text_only": ["text"],
168 | "text2text": ["input", "output"],
169 | "float_only": ["value"],
170 | "image_text": ["images", "text"],
171 | }
172 |
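A minimal sketch (the helper below is hypothetical, not part of this module) of how `INSTANCE_FIELDS_MAP` can be used to sanity-check a dataset dictionary before it is passed to a pipeline.

```python
from lmflow.utils.constants import INSTANCE_FIELDS_MAP

def check_instance_fields(data_dict):
    """Raise if an instance does not carry exactly the fields required by its type."""
    required = set(INSTANCE_FIELDS_MAP[data_dict["type"]])
    for idx, instance in enumerate(data_dict["instances"]):
        if set(instance.keys()) != required:
            raise ValueError(f"instance {idx} has fields {set(instance.keys())}, expected {required}")

check_instance_fields({
    "type": "text2text",
    "instances": [{"input": "Human: Hello.", "output": "Bot: Hi!"}],
})
```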
--------------------------------------------------------------------------------
/tests/utils/test_data_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | from __future__ import absolute_import
4 | import unittest
5 | from lmflow.utils.data_utils import load_data, batchlize, answer_extraction
6 | from lmflow.args import DatasetArguments
7 |
8 | groundtruth_inputs = ['The Transformer architecture [START_REF]',
9 | 'The Schwarzschild radius is defined as: \\[',
10 | 'A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? ',
11 | '[START_I_SMILES]',
12 | '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords',
13 | 'The reason why Transformers replaced RNNs was because',
14 | 'Question: What is the notch signaling pathway?\n\nAnswer:',
15 | '# Multi-Head Attention\n\n',
16 | 'Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n',
17 | 'Lecture 1: The Ising Model\n\n',
18 | 'Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. We believe these results demonstrate the potential for language models as a new interface for science. We open source the model for the benefit of the scientific community.\n\nTLDR:',
19 | '[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for',
20 | 'what is the capital of US?',
21 | ]
22 |
23 | groundtruth_outputs = ["NA"] * 13
24 |
25 | mc_output = ['Answer: (C) Generation of free radicals',
26 | 'Answer: C Generation of free radicals',
27 | 'Answer: C',
28 | 'Answer: (C)',
29 | 'A: C',
30 | 'A: (C)',
31 | 'Output: (C) Generation of free radicals',
32 | 'Output: C Generation of free radicals',
33 | 'Output: C',
34 | 'Output: (C)',
35 | ]
36 |
37 | mc_answer = ['c'] * 10
38 |
39 | qa_output = ['Yes.',
40 | 'Answer: Yes',
41 | 'Answer: Yes.',
42 | 'Yes ',
43 | 'No.',
44 | 'Answer: No',
45 | 'Answer: No.',
46 | 'No ',
47 | 'Maybe.',
48 | 'Answer: Maybe',
49 | 'Answer: Maybe.',
50 | 'Maybe ',
51 | ]
52 | qa_answer = ['yes'] * 4 + ['no'] * 4 + ['maybe'] * 4
53 |
54 | class DataUtilsTest(unittest.TestCase):
55 | def test_load_data(self):
56 | file_name = "data/example_dataset/test/test_13.json"
57 |
58 | inputs, outputs, datasize = load_data(file_name=file_name)
59 | # Test for inputs
60 | for i in range(0,len(inputs)):
61 | self.assertEqual(inputs[i], groundtruth_inputs[i])
62 | # Test for outputs
63 | for i in range(0,len(outputs)):
64 | self.assertEqual(outputs[i], groundtruth_outputs[i])
65 | # Test for datasize
66 | self.assertEqual(datasize, 13)
67 |
68 | def test_batchlize(self):
69 | file_name = "data/example_dataset/test/test_13.json"
70 | inputs, outputs, datasize = load_data(file_name=file_name)
71 | dataset = []
72 | for idx in range(len(outputs)):
73 | dataset.append({"input":inputs[idx], "output":outputs[idx], "input_idx":idx})
74 | # TODO: add test for random shuffle case
75 | dataloader = batchlize(dataset, 4, random_shuffle= False)
76 | self.assertEqual(len(dataloader), 13 // 4 + 1)
77 |
78 | def test_answer_extraction(self):
79 | # Test for medmcqa dataset
80 | for i in range(0,len(mc_output)):
81 | self.assertEqual(answer_extraction(mc_output[i], answer_type="medmcqa"), mc_answer[i])
82 | # Test for usmle dataset
83 | for i in range(0,len(mc_output)):
84 | self.assertEqual(answer_extraction(mc_output[i], answer_type="usmle"), mc_answer[i])
85 | # Test for pubmedqa dataset
86 | for i in range(0,len(qa_output)):
87 | self.assertEqual(answer_extraction(qa_output[i], answer_type="pubmedqa"), qa_answer[i])
88 |
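
Note: the `test_batchlize` case above asserts that 13 examples with a batch size of 4 yield `13 // 4 + 1 = 4` batches. A minimal sketch of that chunking behavior, assuming `batchlize` (from `lmflow.utils.data_utils`) simply slices the dataset into consecutive fixed-size groups when `random_shuffle=False`; `simple_batchlize` below is a hypothetical stand-in, not the library implementation:

```
import random

def simple_batchlize(examples, batch_size, random_shuffle=False):
    # Hypothetical stand-in for lmflow.utils.data_utils.batchlize:
    # groups examples into consecutive batches; the last batch may be smaller.
    if random_shuffle:
        random.shuffle(examples)
    return [examples[i:i + batch_size] for i in range(0, len(examples), batch_size)]

batches = simple_batchlize(list(range(13)), 4)
assert len(batches) == 13 // 4 + 1  # 4 batches of sizes 4, 4, 4, 1
```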
--------------------------------------------------------------------------------
/examples/chatbot.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """A simple shell chatbot implemented with lmflow APIs.
5 | """
6 | import logging
7 | import json
8 | import os
9 | import sys
10 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
11 | import warnings
12 |
13 | from dataclasses import dataclass, field
14 | from transformers import HfArgumentParser
15 | from typing import Optional
16 |
17 | from lmflow.datasets.dataset import Dataset
18 | from lmflow.pipeline.auto_pipeline import AutoPipeline
19 | from lmflow.models.auto_model import AutoModel
20 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
21 |
22 |
23 | logging.disable(logging.ERROR)
24 | warnings.filterwarnings("ignore")
25 |
26 |
27 | @dataclass
28 | class ChatbotArguments:
29 | prompt_structure: Optional[str] = field(
30 | default="{input_text}",
31 | metadata={
32 | "help": "prompt structure given user's input text"
33 | },
34 | )
35 | end_string: Optional[str] = field(
36 | default="\n\n",
37 | metadata={
38 | "help": "end string mark of the chatbot's output"
39 | },
40 | )
41 |
42 | def main():
43 | pipeline_name = "inferencer"
44 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
45 |
46 | parser = HfArgumentParser((
47 | ModelArguments,
48 | PipelineArguments,
49 | ChatbotArguments,
50 | ))
51 | model_args, pipeline_args, chatbot_args = (
52 | parser.parse_args_into_dataclasses()
53 | )
54 | inferencer_args = pipeline_args
55 |
56 |     with open(pipeline_args.deepspeed, "r") as f:
57 | ds_config = json.load(f)
58 |
59 | model = AutoModel.get_model(
60 | model_args,
61 | tune_strategy='none',
62 | ds_config=ds_config,
63 | device=pipeline_args.device,
64 | )
65 |
66 | # We don't need input data, we will read interactively from stdin
67 | data_args = DatasetArguments(dataset_path=None)
68 | dataset = Dataset(data_args)
69 |
70 | inferencer = AutoPipeline.get_pipeline(
71 | pipeline_name=pipeline_name,
72 | model_args=model_args,
73 | data_args=data_args,
74 | pipeline_args=pipeline_args,
75 | )
76 |
77 | # Chats
78 | model_name = model_args.model_name_or_path
79 | if model_args.lora_model_path is not None:
80 | model_name += f" + {model_args.lora_model_path}"
81 |
82 | guide_message = (
83 | "\n"
84 | f"#############################################################################\n"
85 | f"## A {model_name} chatbot is now chatting with you!\n"
86 | f"#############################################################################\n"
87 | "\n"
88 | )
89 | print(guide_message)
90 |
91 | # context = (
92 | # "You are a helpful assistant who follows the given instructions"
93 | # " unconditionally."
94 | # )
95 | context = ""
96 |
97 | end_string = chatbot_args.end_string
98 | prompt_structure = chatbot_args.prompt_structure
99 |
100 | while True:
101 | input_text = input("User >>> ")
102 | if input_text == "exit":
103 | print("exit...")
104 | break
105 | elif input_text == "reset":
106 | context = ""
107 | print("Chat history cleared")
108 | continue
109 | if not input_text:
110 | input_text = " "
111 |
112 | context += prompt_structure.format(input_text=input_text)
113 | context = context[-model.get_max_length():] # Memory of the bot
114 |
115 | input_dataset = dataset.from_dict({
116 | "type": "text_only",
117 | "instances": [ { "text": context } ]
118 | })
119 |
120 | print("Bot: ", end="")
121 | print_index = 0
122 |
123 | token_per_step = 4
124 |
125 | for response, flag_break in inferencer.stream_inference(
126 | context=context,
127 | model=model,
128 | max_new_tokens=inferencer_args.max_new_tokens,
129 | token_per_step=token_per_step,
130 | temperature=inferencer_args.temperature,
131 | end_string=end_string,
132 | input_dataset=input_dataset
133 | ):
134 | # Prints characters in the buffer
135 | new_print_index = print_index
136 | for char in response[print_index:]:
137 | if end_string is not None and char == end_string[0]:
138 | if new_print_index + len(end_string) >= len(response):
139 | break
140 |
141 | new_print_index += 1
142 | print(char, end="", flush=True)
143 |
144 | print_index = new_print_index
145 |
146 | if flag_break:
147 | break
148 | print("\n", end="")
149 |
150 | context += response + "\n"
151 |
152 |
153 | if __name__ == "__main__":
154 | main()
155 |
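
Note: each chat turn in `chatbot.py` appends the user's text to a rolling `context` via `prompt_structure`, truncates it to the model's maximum length, and wraps it in a `text_only` dataset dict. A minimal sketch of that bookkeeping outside lmflow; the `###Human/###Assistant` prompt structure and the `max_length` value are illustrative assumptions (the script's default structure is just `"{input_text}"`):

```
# Illustrative stand-ins; the real values come from ChatbotArguments and model.get_max_length().
prompt_structure = "###Human: {input_text}###Assistant:"
max_length = 512

context = ""
for user_turn, bot_reply in [("Hello!", "Hi, how can I help?")]:
    context += prompt_structure.format(input_text=user_turn)
    context = context[-max_length:]  # keep only the most recent characters
    input_dataset = {
        "type": "text_only",               # format passed to dataset.from_dict
        "instances": [{"text": context}],
    }
    # inferencer.stream_inference(...) would consume input_dataset here
    context += bot_reply + "\n"
```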
--------------------------------------------------------------------------------
/docs/source/_static/logo3.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/service/app.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | import os
4 |
5 | from flask import Flask, request, stream_with_context
6 | from flask import render_template
7 | from flask_cors import CORS
8 | from accelerate import Accelerator
9 | from dataclasses import dataclass, field
10 | from transformers import HfArgumentParser
11 | from typing import Optional
12 |
13 | from lmflow.datasets.dataset import Dataset
14 | from lmflow.pipeline.auto_pipeline import AutoPipeline
15 | from lmflow.models.auto_model import AutoModel
16 | from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
17 |
18 | WINDOW_LENGTH = 512
19 |
20 | @dataclass
21 | class AppArguments:
22 | end_string: Optional[str] = field(
23 | default="##",
24 | metadata={
25 | "help": "end string mark of the chatbot's output"
26 | },
27 | )
28 | max_new_tokens: Optional[int] = field(
29 | default=200,
30 | metadata={
31 | "help": "maximum number of generated tokens"
32 | },
33 | )
34 |
35 | parser = HfArgumentParser((
36 | ModelArguments,
37 | AppArguments,
38 | ))
39 |
40 | model_args, app_args = (
41 | parser.parse_args_into_dataclasses()
42 | )
43 |
44 | app = Flask(__name__)
45 | CORS(app)
46 | ds_config_path = "./examples/ds_config.json"
47 | with open(ds_config_path, "r") as f:
48 | ds_config = json.load(f)
49 |
50 |
51 | local_rank = int(os.getenv("LOCAL_RANK", "0"))
52 | world_size = int(os.getenv("WORLD_SIZE", "1"))
53 | torch.cuda.set_device(local_rank)
54 | model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config, use_accelerator=True)
55 | accelerator = Accelerator()
56 |
57 | def stream_generate(inputs, context_len=1024, max_new_tokens=128, end_string="##"):
58 |
59 |
60 | max_src_len = context_len - max_new_tokens - len(end_string)
61 | input_ids = model.tokenizer(inputs).input_ids
62 | input_echo_len = len(input_ids)
63 | output_ids = list(input_ids)
64 | input_ids = input_ids[-max_src_len:]
65 |
66 | past_key_values = out = None
67 | flag_stop = False
68 | for i in range(0, max_new_tokens):
69 | with accelerator.autocast():
70 | if i == 0:
71 | with torch.no_grad():
72 | out = model.backend_model(torch.as_tensor([input_ids], device=local_rank), use_cache=True)
73 | logits = out.logits
74 | past_key_values = out.past_key_values
75 | else:
76 | with torch.no_grad():
77 | out = model.backend_model(
78 | input_ids=torch.as_tensor([[token]], device=local_rank),
79 | use_cache=True,
80 | past_key_values=past_key_values,
81 | )
82 | logits = out.logits
83 | past_key_values = out.past_key_values
84 |
85 | last_token_logits = logits[0, -1, :]
86 | token = int(torch.argmax(last_token_logits))
87 | output_ids.append(token)
88 |
89 | tmp_output_ids = output_ids[input_echo_len:]
90 |
91 | output = model.tokenizer.decode(
92 | tmp_output_ids,
93 | skip_special_tokens=True,
94 | spaces_between_special_tokens=False,
95 | )
96 |
97 | if end_string in output:
98 | index = output.index(end_string)
99 | output = output[:index]
100 | flag_stop = True
101 | yield output.replace("\ufffd","")
102 |
103 |         if flag_stop:
104 | break
105 |
106 | @app.route('/predict', methods=['POST'])
107 | def predict():
108 | if(request.method == "POST"):
109 | try:
110 | user_input = request.get_json()["Input"]
111 | conversation = request.get_json()["History"]
112 |
113 | history_input = ""
114 | if(len(conversation) >= 2):
115 | if(len(conversation) == 2):
116 | history_input ="###Human: " + user_input +" "
117 | else:
118 | for i in range(0, len(conversation)-1):
119 | if(i % 2 == 0):
120 | history_input = history_input + "###Human: " + conversation[i+1]["content"] + " "
121 | elif(i % 2 == 1):
122 | history_input = history_input + "###Assistant:" + conversation[i+1]["content"]
123 | history_input = history_input + "###Assistant:"
124 |
125 | if len(model.encode(history_input))> WINDOW_LENGTH:
126 | inputs = model.encode(history_input)
127 | inputs = inputs[-WINDOW_LENGTH:]
128 | history_input = model.decode(inputs)
129 |
130 | return app.response_class(stream_with_context(stream_generate(history_input,
131 | max_new_tokens=app_args.max_new_tokens,
132 | end_string=app_args.end_string)))
133 | except Exception as ex:
134 | print(ex)
135 |             text_out = str(ex)
136 | else:
137 | text_out = "Not POST Method"
138 | return text_out
139 |
140 | @app.route('/', methods=['GET'])
141 | def login():
142 |
143 | return render_template('index.html')
144 |
145 |
146 | app.run(port=5000, debug=False)
147 |
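
Note: the `/predict` route above reads a JSON body with an `"Input"` string and a `"History"` list whose items carry a `"content"` field, and streams plain text back. A hypothetical client sketch, assuming the service runs locally on the default port 5000; the `"role"` keys are an assumption, since only `"content"` is read by `app.py`:

```
import requests

payload = {
    "Input": "What is LMFlow?",
    "History": [
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello! How can I help?"},
        {"role": "user", "content": "What is LMFlow?"},
    ],
}

# Stream the generated text chunk by chunk as the server yields it.
with requests.post("http://127.0.0.1:5000/predict", json=payload, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```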
--------------------------------------------------------------------------------
/scripts/export_llama_state_dict_checkpoint.py:
--------------------------------------------------------------------------------
1 | # Export state dict for downstream inference, such as llama.cpp
2 |
3 | import json
4 | import os
5 |
6 | import torch
7 | import transformers
8 | from peft import PeftModel
9 | from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402
10 |
11 |
12 | def permute(w):
13 | return (
14 | w.view(n_heads, dim // n_heads // 2, 2, dim)
15 | .transpose(1, 2)
16 | .reshape(dim, dim)
17 | )
18 |
19 |
20 | def unpermute(w):
21 | return (
22 | w.view(n_heads, 2, dim // n_heads // 2, dim)
23 | .transpose(1, 2)
24 | .reshape(dim, dim)
25 | )
26 |
27 | def translate_state_dict_key(k): # noqa: C901
28 | k = k.replace("base_model.model.", "")
29 | if k == "model.embed_tokens.weight":
30 | return "tok_embeddings.weight"
31 | elif k == "model.norm.weight":
32 | return "norm.weight"
33 | elif k == "lm_head.weight":
34 | return "output.weight"
35 | elif k.startswith("model.layers."):
36 | layer = k.split(".")[2]
37 | if k.endswith(".self_attn.q_proj.weight"):
38 | return f"layers.{layer}.attention.wq.weight"
39 | elif k.endswith(".self_attn.k_proj.weight"):
40 | return f"layers.{layer}.attention.wk.weight"
41 | elif k.endswith(".self_attn.v_proj.weight"):
42 | return f"layers.{layer}.attention.wv.weight"
43 | elif k.endswith(".self_attn.o_proj.weight"):
44 | return f"layers.{layer}.attention.wo.weight"
45 | elif k.endswith(".mlp.gate_proj.weight"):
46 | return f"layers.{layer}.feed_forward.w1.weight"
47 | elif k.endswith(".mlp.down_proj.weight"):
48 | return f"layers.{layer}.feed_forward.w2.weight"
49 | elif k.endswith(".mlp.up_proj.weight"):
50 | return f"layers.{layer}.feed_forward.w3.weight"
51 | elif k.endswith(".input_layernorm.weight"):
52 | return f"layers.{layer}.attention_norm.weight"
53 | elif k.endswith(".post_attention_layernorm.weight"):
54 | return f"layers.{layer}.ffn_norm.weight"
55 | elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
56 | return None
57 | else:
58 | print(layer, k)
59 | raise NotImplementedError
60 | else:
61 | print(k)
62 | raise NotImplementedError
63 |
64 | PARAM_LIST = {
65 | 7:{
66 | "dim": 4096,
67 | "multiple_of": 256,
68 | "n_heads": 32,
69 | "n_layers": 32,
70 | "norm_eps": 1e-06,
71 | "vocab_size": -1,
72 | },
73 | 13:{
74 | "dim": 5120,
75 | "multiple_of": 256,
76 | "n_heads": 40,
77 | "n_layers": 40,
78 | "norm_eps": 1e-06,
79 | "vocab_size": -1,
80 | },
81 | 33:{
82 | "dim": 6656,
83 | "multiple_of": 256,
84 | "n_heads": 52,
85 | "n_layers": 60,
86 | "norm_eps": 1e-06,
87 | "vocab_size": -1,
88 | }}
89 |
90 |
91 | BASE_MODEL = os.environ.get("BASE_MODEL", None)
92 | assert (
93 | BASE_MODEL
94 | ), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=decapoda-research/llama-30b-hf`" # noqa: E501
95 | LORA_MODEL = os.environ.get("LORA_MODEL", None)
96 |
97 | MODEL_SIZE = os.environ.get("MODEL_SIZE", None)
98 | assert (
99 |     MODEL_SIZE
100 | ), "Please specify a value for MODEL_SIZE environment variable, e.g. `export MODEL_SIZE=33`" # noqa: E501
101 | MODEL_SIZE = int(MODEL_SIZE)
102 |
103 | tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
104 |
105 | base_model = LlamaForCausalLM.from_pretrained(
106 | BASE_MODEL,
107 | load_in_8bit=False,
108 | torch_dtype=torch.float16,
109 | device_map={"": "cpu"},
110 | )
111 |
112 |
113 | params = PARAM_LIST[MODEL_SIZE]
114 |
115 | n_layers = params["n_layers"]
116 | n_heads = params["n_heads"]
117 | dim = params["dim"]
118 | dims_per_head = dim // n_heads
119 | base = 10000.0
120 | inv_freq = 1.0 / (
121 | base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
122 | )
123 |
124 | if LORA_MODEL is not None:
125 | lora_model = PeftModel.from_pretrained(
126 | base_model,
127 | LORA_MODEL,
128 | device_map={"": "cpu"},
129 | torch_dtype=torch.float16,
130 | )
131 |
132 |
133 | # merge weights
134 | for layer in lora_model.base_model.model.model.layers:
135 | layer.self_attn.q_proj.merge_weights = True
136 | layer.self_attn.v_proj.merge_weights = True
137 |
138 | lora_model.train(False)
139 |
140 | lora_model_sd = lora_model.state_dict()
141 |
142 |
143 |
144 |
145 |
146 |
147 | new_state_dict = {}
148 | for k, v in lora_model_sd.items():
149 | new_k = translate_state_dict_key(k)
150 | if new_k is not None:
151 | if "wq" in new_k or "wk" in new_k:
152 | new_state_dict[new_k] = unpermute(v)
153 | else:
154 | new_state_dict[new_k] = v
155 | else:
156 | base_model.eval()
157 | new_state_dict = {}
158 | state_dicts = base_model.state_dict()
159 | for k, v in state_dicts.items():
160 | new_k = translate_state_dict_key(k)
161 | if new_k is not None:
162 | if "wq" in new_k or "wk" in new_k:
163 | new_state_dict[new_k] = unpermute(v)
164 | else:
165 | new_state_dict[new_k] = v
166 |
167 |
168 |
169 | os.makedirs("./ckpt", exist_ok=True)
170 |
171 | torch.save(new_state_dict, "./ckpt/consolidated.00.pth")
172 |
173 | with open("./ckpt/params.json", "w") as f:
174 | json.dump(params, f)
175 |
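
Note: the export script writes `./ckpt/consolidated.00.pth` (a state dict with original-LLaMA key names produced by `translate_state_dict_key`) and `./ckpt/params.json`. A small sanity-check sketch for loading those files back; it is not part of the repo and assumes the script has already been run:

```
import json
import torch

with open("./ckpt/params.json") as f:
    params = json.load(f)
print(params["n_layers"], params["n_heads"], params["dim"])

# Keys follow the original LLaMA naming, e.g. "tok_embeddings.weight",
# "layers.0.attention.wq.weight", "output.weight".
state_dict = torch.load("./ckpt/consolidated.00.pth", map_location="cpu")
print(len(state_dict), next(iter(state_dict)))
```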
--------------------------------------------------------------------------------
/examples/raft_align.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """Alignment tuning example, such as RLHF."""
5 |
6 | import logging
7 | import os
8 | import sys
9 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
10 | from dataclasses import dataclass, field
11 | from typing import Optional
12 |
13 | from transformers import HfArgumentParser, pipeline, AutoTokenizer
14 |
15 | from lmflow.args import (
16 | ModelArguments,
17 | DatasetArguments,
18 | AutoArguments,
19 | )
20 |
21 | from lmflow.datasets.dataset import Dataset
22 | from lmflow.models.auto_model import AutoModel
23 | from lmflow.pipeline.auto_pipeline import AutoPipeline
24 |
25 |
26 | @dataclass
27 | class RewardArguments:
28 | reward_type: Optional[str] = field(
29 | default="hf_pipeline",
30 | metadata={
31 | "help": (
32 | "type of reward model, support huggingface pipeline. Will"
33 | " support \"customized\" torch.nn.modules in the future."
34 | ),
35 | },
36 | )
37 | reward_model_or_path: Optional[str] = field(
38 | default="weqweasdas/hh_rlhf_rm",
39 | metadata={
40 | "help": (
41 | "reward model name (huggingface) or its path"
42 | ),
43 | },
44 | )
45 | reward_task: Optional[str] = field(
46 | default="sentiment-analysis",
47 | metadata={
48 |             "help": "type of reward task, such as sentiment-analysis or detoxification."
49 | },
50 | )
51 | reward_model_args: Optional[str] = field(
52 | default="return_all_scores=True, function_to_apply=\"none\", batch_size=1",
53 | metadata={
54 | "help": (
55 | "extra arguments required by different type of reward models."
56 | ),
57 | },
58 | )
59 |
60 |
61 | def get_reward_function(reward_args, pipeline_args):
62 | args = reward_args
63 | reward_type = args.reward_type
64 |
65 | if reward_type == "hf_pipeline":
66 |
67 |         # The GPT-2-based reward model tokenizer has no pad token by default,
68 |         # so we reuse the eos token as padding (only needed for this model).
69 | rm_tokenizer = AutoTokenizer.from_pretrained(reward_args.reward_model_or_path)
70 | rm_tokenizer.pad_token = rm_tokenizer.eos_token
71 | rm_tokenizer.pad_token_id = rm_tokenizer.eos_token_id
72 | rm_tokenizer.padding_side = "left"
73 |
74 | hf_pipe = pipeline(
75 | reward_args.reward_task,
76 | model=reward_args.reward_model_or_path,
77 | device=f"cuda:{pipeline_args.local_rank}",
78 | tokenizer=rm_tokenizer
79 | )
80 | def reward_func(dataset: Dataset):
81 | if dataset.type != "text_only":
82 | raise NotImplementedError(
83 | "reward function only accept \"text_only\" datasets"
84 | )
85 | pipe_kwargs = {
86 | "return_all_scores": True,
87 | "function_to_apply": "none",
88 | "batch_size": 1
89 | }
90 |
91 | data_dict = dataset.to_dict()
92 | texts_for_rewards = [
93 | sample["text"] for sample in data_dict["instances"]
94 | ]
95 | pipe_outputs = hf_pipe(texts_for_rewards, **pipe_kwargs)
96 | rewards = [output[0]["score"] for output in pipe_outputs]
97 |
98 | reward_dataset = Dataset.create_from_dict({
99 | "type": "float_only",
100 | "instances": [
101 | { "value": reward } for reward in rewards
102 | ]
103 | })
104 | return reward_dataset
105 |
106 | return reward_func
107 | else:
108 |         raise NotImplementedError(f"unsupported reward type \"{reward_type}\"")
109 |
110 |
111 | def main():
112 | # Parses arguments
113 | pipeline_name = "raft_aligner"
114 | PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
115 |
116 | parser = HfArgumentParser((
117 | ModelArguments,
118 | DatasetArguments,
119 | PipelineArguments,
120 | RewardArguments,
121 | ))
122 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
123 | model_args, data_args, pipeline_args, reward_args = parser.parse_json_file(
124 | json_file=os.path.abspath(sys.argv[1])
125 | )
126 | else:
127 | model_args, data_args, pipeline_args, reward_args = parser.parse_args_into_dataclasses()
128 |
129 | # Initializes pipeline, dataset and model for reward training
130 | aligner = AutoPipeline.get_pipeline(
131 | pipeline_name=pipeline_name,
132 | model_args=model_args,
133 | data_args=data_args,
134 | pipeline_args=pipeline_args,
135 | )
136 | dataset = Dataset(data_args)
137 | model = AutoModel.get_model(model_args)
138 |
139 | # Initializes reward function
140 | reward_function = get_reward_function(reward_args, pipeline_args)
141 |
142 | reward_model_args = ModelArguments(arch_type="text_regression")
143 | reward_model = AutoModel.get_model(reward_model_args)
144 | reward_model.register_inference_function(reward_function)
145 |
146 | # Aligns model with rewards
147 | aligned_model = aligner.align(
148 | model=model,
149 | dataset=dataset,
150 | reward_model=reward_model,
151 | )
152 |
153 |
154 | if __name__ == '__main__':
155 | main()
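
Note: the reward function built by `get_reward_function` consumes a `text_only` dataset and returns a `float_only` dataset with one score per instance. A sketch of the two dict layouts used above; the texts and scores are illustrative, and the `###Human/###Assistant` formatting is an assumption about how prompts are rendered for the reward model:

```
# Input to reward_func: dataset.to_dict() of a "text_only" dataset.
text_only_dataset = {
    "type": "text_only",
    "instances": [
        {"text": "###Human: How are you? ###Assistant: I'm doing well, thanks!"},
        {"text": "###Human: How are you? ###Assistant: Go away."},
    ],
}

# Output of reward_func: one scalar reward per instance, wrapped via Dataset.create_from_dict.
float_only_dataset = {
    "type": "float_only",
    "instances": [
        {"value": 1.7},   # hypothetical scores from the reward model
        {"value": -0.9},
    ],
}
```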
--------------------------------------------------------------------------------
/scripts/data_preprocess/concat_shuffle_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
4 | """
5 | This script is designed for handling large datasets.
6 | It merges multiple datasets located in the same directory, shuffles them, and splits them into training, evaluation, and testing sets.
7 | The training set is further divided into 10 folds.
8 | """
9 | from __future__ import absolute_import
10 |
11 | import argparse
12 | import json
13 | import textwrap
14 | import sys
15 | import os
16 | import random
17 | import gc
18 |
19 | def parse_argument(sys_argv):
20 | """Parses arguments from command line.
21 | Args:
22 | sys_argv: the list of arguments (strings) from command line.
23 | Returns:
24 | A struct whose member corresponds to the required (optional) variable.
25 | For example,
26 | ```
27 |         args = parse_argument(['main.py', '--input', 'a.txt', '--num', '10'])
28 | args.input # 'a.txt'
29 | args.num # 10
30 | ```
31 | """
32 | parser = argparse.ArgumentParser(
33 | formatter_class=argparse.RawTextHelpFormatter)
34 |
35 | # Training parameters
36 | parser.add_argument(
37 | "--output_path", type=str,
38 | default=None,
39 |         help=textwrap.dedent("output directory where the train/eval/test splits are saved")
40 | )
41 | parser.add_argument(
42 | "--merge_from_path", type=str,
43 | nargs="+",
44 | help=textwrap.dedent(
45 | "dataset path of the extra dataset that will be merged"
46 | " into input dataset"
47 | )
48 | )
49 | parser.add_argument(
50 | "--seed", type=int, default=42,
51 | help=textwrap.dedent("pseudorandom seed")
52 | )
53 | parser.add_argument(
54 | "--eval_size", type=int, default=200,
55 | help=textwrap.dedent("size of eval dataset")
56 | )
57 | parser.add_argument(
58 | "--test_size", type=int, default=1000,
59 | help=textwrap.dedent("size of test dataset")
60 | )
61 | parser.add_argument(
62 | "--k", type=int, default=10,
63 |         help=textwrap.dedent("the train dataset will be divided into k folds")
64 | )
65 | # Parses from commandline
66 | args = parser.parse_args(sys_argv[1:])
67 |
68 | return args
69 |
70 |
71 | def main():
72 | args = parse_argument(sys.argv)
73 |
74 | # concat
75 | if args.merge_from_path is not None:
76 | for i in range(0, len(args.merge_from_path)):
77 | with open(args.merge_from_path[i], "r") as fin:
78 | extra_data_dict = json.load(fin)
79 | if i == 0:
80 | data_dict = extra_data_dict
81 | else:
82 | if data_dict["type"] != extra_data_dict["type"]:
83 | raise ValueError(
84 | 'two dataset have different types:'
85 | f' input dataset: "{data_dict["type"]}";'
86 | f' merge from dataset: "{extra_data_dict["type"]}"'
87 | )
88 | data_dict["instances"].extend(extra_data_dict["instances"])
89 | else:
90 | raise ValueError("No merge files specified")
91 | del extra_data_dict
92 | gc.collect()
93 | print('finish concat')
94 |
95 | # shuffle
96 | random.seed(args.seed)
97 | random.shuffle(data_dict["instances"])
98 | print('finish shuffle')
99 | # split to train, eval, test
100 | train_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][args.eval_size:-args.test_size]}
101 | eval_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][:args.eval_size]}
102 | test_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][-args.test_size:]}
103 | del data_dict
104 | gc.collect()
105 |
106 | # divide train in 10 folds
107 | num_instances = len(train_data_dict["instances"])
108 | split_size = num_instances // args.k
109 | split_data = []
110 | for i in range(args.k):
111 | if i < args.k-1:
112 | split = train_data_dict["instances"][i*split_size : (i+1)*split_size]
113 | else:
114 | # Last split may have remaining instances
115 | split = train_data_dict["instances"][i*split_size:]
116 | split_data.append({'type': train_data_dict["type"], 'instances': split})
117 |
118 | del train_data_dict
119 | gc.collect()
120 |
121 | print('finish split')
122 | # save dataset under output_path
123 |
124 |     if args.output_path is None:
125 |         raise ValueError("--output_path must be specified, since splits are saved to files")
126 |
127 | train_save_path=os.path.join(args.output_path,"train_{k}_folds".format(k=args.k))
128 | if not os.path.exists(train_save_path):
129 | os.makedirs(train_save_path)
130 | for i in range(args.k):
131 | with open(train_save_path+"/train_"+str(i)+".json", 'w') as f:
132 | json.dump(split_data[i], f, indent=4, ensure_ascii=False)
133 |
134 | eval_save_path=os.path.join(args.output_path,"eval")
135 | if not os.path.exists(eval_save_path):
136 | os.makedirs(eval_save_path)
137 | with open(eval_save_path+'/eval.json','w') as f:
138 | json.dump(eval_data_dict,f,indent=4,ensure_ascii=False)
139 |
140 | test_save_path=os.path.join(args.output_path,"test")
141 | if not os.path.exists(test_save_path):
142 | os.makedirs(test_save_path)
143 | with open(test_save_path+'/test.json','w') as f:
144 | json.dump(test_data_dict,f,indent=4,ensure_ascii=False)
145 |
146 |
147 |
148 | if __name__ == "__main__":
149 | main()
150 |
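
Note: every file passed to `--merge_from_path` must be a JSON dataset with matching `"type"` fields, and the script writes its splits under `--output_path`. A short illustration of the expected input format and the resulting directory layout:

```
import json

# Minimal input dataset; all merged files must share the same "type".
example_input = {
    "type": "text_only",
    "instances": [{"text": "sample 1"}, {"text": "sample 2"}],
}
with open("part_0.json", "w") as f:
    json.dump(example_input, f, indent=4, ensure_ascii=False)

# After `concat_shuffle_split.py --merge_from_path part_0.json ... --output_path OUT --k 10`,
# OUT contains:
#   OUT/train_10_folds/train_0.json ... train_9.json
#   OUT/eval/eval.json
#   OUT/test/test.json
```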
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | LMFlow.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------