├── llmadmin
│   ├── api
│   │   ├── __init__.py
│   │   ├── env.py
│   │   ├── sdk.py
│   │   └── cli.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── llm_event.py
│   │   ├── evaluation.py
│   │   └── backend.py
│   ├── backend
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── ft
│   │   │   │   ├── const.py
│   │   │   │   ├── methods
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── lora.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── tasks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── sequenceclassification_glue_cola.py
│   │   │   │   │   ├── sequenceclassification_glue_mrpc.py
│   │   │   │   │   ├── sequenceclassification_yelp_review_full.py
│   │   │   │   │   ├── _base.py
│   │   │   │   │   ├── maskedlm_imdb.py
│   │   │   │   │   ├── tokenclassification_conll2003.py
│   │   │   │   │   ├── noheader_AdvertiseGen.py
│   │   │   │   │   └── text_generation_AdvertiseGen.py
│   │   │   │   ├── callback.py
│   │   │   │   ├── _base.py
│   │   │   │   ├── test
│   │   │   │   │   └── test_seq_cls_bert_yelp.py
│   │   │   │   ├── ray_train.py
│   │   │   │   └── transformer.py
│   │   │   ├── pipelines
│   │   │   │   ├── llamacpp
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── processors.py
│   │   │   │   │   └── llamacpp_pipeline.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── processors.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── default_pipeline.py
│   │   │   │   └── default_transformers_pipeline.py
│   │   │   ├── initializers
│   │   │   │   ├── hf_transformers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── deepspeed.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _base.py
│   │   │   │   └── llamacpp.py
│   │   │   └── utils.py
│   │   ├── server
│   │   │   ├── __init__.py
│   │   │   ├── exceptions.py
│   │   │   ├── _batch.py
│   │   │   ├── run.py
│   │   │   └── config.py
│   │   └── logger.py
│   ├── frontend
│   │   ├── __init__.py
│   │   ├── mongo_secrets.py
│   │   ├── javascript_loader.py
│   │   ├── app.py
│   │   ├── javascript
│   │   │   └── llmadmin.js
│   │   ├── utils.py
│   │   ├── mongo_logger.py
│   │   └── leaderboard.py
│   └── __init__.py
├── docs
│   └── llm-finetune.png
├── MANIFEST.in
├── pyproject.toml
├── dataset
│   └── glue
│       └── mrpc
│           └── 1.0.0
│               ├── test-00000-of-00001.parquet
│               ├── train-00000-of-00001.parquet
│               └── validation-00000-of-00001.parquet
├── llm_finetune.py
├── requirements.txt
├── models
│   ├── ft--sequenceclassification--bert-base-uncased.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B-Chat.yaml
│   ├── ft--text-generation--THUDM-chatglm2-6b.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B.yaml
│   ├── ft--sequenceclassification--bert-base-uncased-lora.yaml
│   ├── ft--maskedlm--distilbert-base-uncased.yaml
│   └── ft--text-generation--THUDM-chatglm3-6b.yaml
├── setup.py
├── llm_finetune_ray.py
├── .gitignore
├── README.md
└── LICENSE
/llmadmin/api/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmadmin/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmadmin/frontend/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmadmin/backend/server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmadmin/__init__.py:
--------------------------------------------------------------------------------
1 | from llmadmin.api.sdk import *
2 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/const.py:
--------------------------------------------------------------------------------
1 | CHECKPOINT_PATH = "./fintuned/"
--------------------------------------------------------------------------------
/llmadmin/backend/server/exceptions.py:
--------------------------------------------------------------------------------
1 | class PromptTooLongError(ValueError):
2 | pass
3 |
--------------------------------------------------------------------------------
/docs/llm-finetune.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/docs/llm-finetune.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE *.sh
2 | recursive-include tests *.py
3 | recursive-include models *.yaml
4 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | select = ["E", "F", "I", "ASYNC", "B"]
3 | line-length = 300
4 | ignore = ["F403", "B905"]
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/methods/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import get_train_model
2 |
3 | __all__ = [
4 | "get_train_model"
5 | ]
6 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/llamacpp/__init__.py:
--------------------------------------------------------------------------------
1 | from .llamacpp_pipeline import LlamaCppPipeline
2 |
3 | __all__ = ["LlamaCppPipeline"]
--------------------------------------------------------------------------------
/dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet
--------------------------------------------------------------------------------
/dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet
--------------------------------------------------------------------------------
/dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet
--------------------------------------------------------------------------------
/llm_finetune.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | from llmadmin.api.cli import app
4 | if __name__ == '__main__':
5 | sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
6 | sys.exit(app())
7 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 |
3 | from ._base import BaseFT
4 | from .transformer import TransformersFT
5 | from .ray_train import RayTrain
6 |
7 |
8 | __all__ = [
9 | "TransformersFT", "RayTrain"
10 | ]
11 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/methods/base.py:
--------------------------------------------------------------------------------
1 | from .lora import lora_model
2 | from llmadmin.backend.logger import get_logger
3 |
4 | logger = get_logger(__name__)
5 |
6 | def get_train_model(model, ft_method, trainConfig):
7 | if ft_method == "lora":
8 | lora_config = trainConfig.lora_config
9 | model = lora_model(model, lora_config)
10 | return model
11 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/utils.py:
--------------------------------------------------------------------------------
1 | from llmadmin.backend.server.models import FTApp
2 |
3 | def parse_task_name(ftapp: FTApp):
4 | task_purpose = (ftapp.ft_config.ft_task + "-") if ftapp.ft_config.ft_task else ""
5 | data_path = ftapp.ft_config.data_config.data_path
6 | data_name = ("-" + ftapp.ft_config.data_config.subset) if ftapp.ft_config.data_config.subset else ""
7 |
8 | return task_purpose + data_path + data_name
9 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/initializers/hf_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import DeviceMapInitializer, SingleDeviceInitializer, TransformersInitializer, FinetuneInitializer, AutoModelInitializer, TransformersPipelineInitializer
2 |
3 | __all__ = [
4 | "DeviceMapInitializer",
5 | "SingleDeviceInitializer",
6 | "TransformersInitializer",
7 | "FinetuneInitializer",
8 | "TransformersPipelineInitializer",
9 | "AutoModelInitializer",
10 | ]
11 |
--------------------------------------------------------------------------------
/llmadmin/backend/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import Optional
4 |
5 | LOG_FORMAT = (
6 | "[%(levelname)s %(asctime)s]{rank} %(filename)s: %(lineno)d " "%(message)s"
7 | )
8 |
9 |
10 | def get_logger(name: Optional[str] = None, rank: Optional[int] = None, **kwargs):
11 | if rank is None:
12 | rank = int(os.environ.get("RANK", -1))
13 | logger = logging.getLogger(name)
14 | level = logging.ERROR if rank > 0 else logging.INFO
15 | log_format = LOG_FORMAT.format(rank=f"[Rank {rank}]" if rank > -1 else "")
16 | logging.basicConfig(level=level, format=log_format, **kwargs)
17 | return logger
18 |
--------------------------------------------------------------------------------
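
A minimal usage sketch for the logger helper above (illustrative, not a file in this repository): get_logger configures the global logging format once via basicConfig and derives the log level from the RANK environment variable, so only rank 0 (or a single-process run) logs at INFO.

    import os

    from llmadmin.backend.logger import get_logger

    os.environ.setdefault("RANK", "-1")    # single-process run; ranks > 0 only log errors
    logger = get_logger(__name__)
    logger.info("starting fine-tune run")  # "[INFO <timestamp>] <file>: <line> starting fine-tune run"
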
/requirements.txt:
--------------------------------------------------------------------------------
1 | async_timeout==4.0.3
2 | boto3==1.34.54
3 | datasets==2.18.0
4 | evaluate==0.4.1
5 | fastapi==0.100.1
6 | filelock==3.13.1
7 | gradio==3.39.0
8 | huggingface_hub==0.21.3
9 | jieba==0.42.1
10 | mdit_py_plugins==0.3.3
11 | nltk==3.8.1
12 | numpy==1.26.4
13 | optimum==1.17.1
14 | pandas==2.2.1
15 | peft==0.9.0
16 | pydantic==1.10.9
17 | pymongo==4.6.2
18 | PyYAML==6.0.1
19 | Requests==2.31.0
20 | rich==13.7.1
21 | rouge_chinese==1.0.3
22 | torch==2.1.2
23 | transformers==4.33.0
24 | typer==0.9.0
25 | typing_extensions==4.10.0
26 | socksio==1.0.0
27 | scipy==1.11.1
28 | einops
29 | transformers_stream_generator
30 | tiktoken
31 | cpm_kernels
32 | ray[serve]==2.20.0
33 | ray[train]==2.20.0
34 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 |
3 | from ._base import BasePipeline
4 | from .default_pipeline import DefaultPipeline
5 | from .default_transformers_pipeline import DefaultTransformersPipeline
6 | from .llamacpp import LlamaCppPipeline
7 |
8 |
9 | def get_pipeline_cls_by_name(name: str) -> Type[BasePipeline]:
10 | lowercase_globals = {k.lower(): v for k, v in globals().items()}
11 | ret = lowercase_globals.get(
12 | f"{name.lower()}pipeline", lowercase_globals.get(name.lower(), None)
13 | )
14 | assert ret
15 | return ret
16 |
17 |
18 | __all__ = [
19 | "get_pipeline_cls_by_name",
20 | "DefaultPipeline",
21 | "DefaultTransformersPipeline",
22 | "LlamaCppPipeline",
23 | ]
24 |
--------------------------------------------------------------------------------
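
A small sketch of how get_pipeline_cls_by_name resolves a class from this module's globals; the lookup is case-insensitive and the "Pipeline" suffix is optional. Importing this package pulls in LlamaCppPipeline eagerly, so the optional llama-cpp-python dependency (not pinned in requirements.txt) is assumed to be installed here.

    from llmadmin.backend.llm.pipelines import get_pipeline_cls_by_name

    pipeline_cls = get_pipeline_cls_by_name("llamacpp")        # resolves LlamaCppPipeline
    default_cls = get_pipeline_cls_by_name("DefaultPipeline")  # exact class names work too
    assert pipeline_cls.__name__ == "LlamaCppPipeline"
    assert default_cls.__name__ == "DefaultPipeline"
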
/llmadmin/api/env.py:
--------------------------------------------------------------------------------
1 | def has_ray():
2 | try:
3 | import ray # noqa: F401
4 |
5 | return True
6 | except ImportError:
7 | return False
8 |
9 |
10 | def has_backend():
11 | try:
12 | import llmadmin.backend # noqa: F401
13 |
14 | return True
15 | except ImportError:
16 |         return False
17 |
18 |
19 | def assert_has_ray():
20 | assert has_ray(), (
21 | "This command requires ray to be installed. "
22 | "Please install ray with `pip install 'ray[default]'`"
23 | )
24 |
25 |
26 | def assert_has_backend():
27 | assert has_backend(), (
28 | "This command requires llmadmin backend to be installed. "
29 | "Please install backend dependencies with `pip install llmadmin[backend]`. "
30 | )
31 |
--------------------------------------------------------------------------------
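
A sketch of how these guards are typically used at a command entry point (the call site below is hypothetical):

    from llmadmin.api.env import assert_has_ray, has_backend

    assert_has_ray()   # raises AssertionError with an install hint when ray is missing
    if has_backend():  # False when the backend dependencies are not importable
        import llmadmin.backend  # defer the heavier backend import until it is known to work
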
/llmadmin/backend/llm/pipelines/llamacpp/processors.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | from llama_cpp import LogitsProcessor, StoppingCriteria
5 | from transformers import MaxTimeCriteria, MinNewTokensLengthLogitsProcessor
6 |
7 | from llmadmin.backend.logger import get_logger
8 |
9 | logger = get_logger(__name__)
10 |
11 |
12 | class LlamaCppMinNewTokensLengthLogitsProcessor(
13 | MinNewTokensLengthLogitsProcessor, LogitsProcessor
14 | ):
15 | def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
16 | scores = MinNewTokensLengthLogitsProcessor.__call__(
17 | self, torch.LongTensor(input_ids), torch.FloatTensor(scores)[None, :]
18 | )
19 | return scores[0].tolist()
20 |
21 |
22 | class LlamaMaxTimeCriteria(MaxTimeCriteria, StoppingCriteria):
23 | pass
--------------------------------------------------------------------------------
/llmadmin/backend/llm/initializers/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Type
2 |
3 | from .hf_transformers import (
4 | DeviceMapInitializer,
5 | SingleDeviceInitializer,
6 | FinetuneInitializer,
7 | TransformersPipelineInitializer,
8 | AutoModelInitializer,
9 | )
10 |
11 | if TYPE_CHECKING:
12 | from ._base import LLMInitializer
13 |
14 | from .llamacpp import LlamaCppInitializer
15 |
16 |
17 | def get_initializer_cls_by_name(name: str) -> Type["LLMInitializer"]:
18 | lowercase_globals = {k.lower(): v for k, v in globals().items()}
19 | ret = lowercase_globals.get(
20 | f"{name.lower()}initializer", lowercase_globals.get(name.lower(), None)
21 | )
22 | assert ret
23 | return ret
24 |
25 |
26 | __all__ = [
27 | "get_initializer_cls_by_name",
28 | "DeviceMapInitializer",
29 | "SingleDeviceInitializer",
30 | "FinetuneInitializer",
31 | "AutoModelInitializer",
32 | "LlamaCppInitializer",
33 | "TransformersPipelineInitializer",
34 | ]
35 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from . import sequenceclassification_glue_cola
2 | from . import sequenceclassification_glue_mrpc
3 | from . import tokenclassification_conll2003
4 | from . import noheader_AdvertiseGen
5 | from . import text_generation_AdvertiseGen
6 | from . import maskedlm_imdb
7 | from . import sequenceclassification_yelp_review_full
8 |
9 | TASK_REGISTRY = {
10 | "sequenceclassification-glue-cola": sequenceclassification_glue_cola.SequenceclassificationGlueCola,
11 | "sequenceclassification-glue-mrpc": sequenceclassification_glue_mrpc.SequenceclassificationGlueMrpc,
12 | "tokenclassification-conll2003": tokenclassification_conll2003.TokenclassificationConll2003,
13 | "noheader-AdvertiseGen": noheader_AdvertiseGen.NoheaderAdvertiseGen,
14 | "text-generation-AdvertiseGen": text_generation_AdvertiseGen.NoheaderAdvertiseGen,
15 | "maskedlm-imdb": maskedlm_imdb.MaskedLMImdb,
16 | "sequenceclassification-yelp_review_full": sequenceclassification_yelp_review_full.SequenceclassificationYelpReviewFull
17 | }
--------------------------------------------------------------------------------
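
A lookup sketch for the registry above, assuming the package's dependencies are installed; the key format matches what parse_task_name in ft/utils.py produces (task construction itself is omitted because the Task base class lives in _base.py, which is not shown here).

    from llmadmin.backend.llm.ft.tasks import TASK_REGISTRY

    task_cls = TASK_REGISTRY["sequenceclassification-glue-mrpc"]
    print(task_cls.__name__)                             # SequenceclassificationGlueMrpc
    print(task_cls.DATASET_PATH, task_cls.DATASET_NAME)  # glue mrpc
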
/models/ft--sequenceclassification--bert-base-uncased.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: fill-mask
4 | model_id: bert-base-uncased
5 | max_input_words: 800
6 | initialization:
7 | initializer:
8 | type: Finetune
9 | dtype: float32
10 | from_pretrained_kwargs:
11 | trust_remote_code: true
12 | ft_config:
13 | ft_task: "sequenceclassification"
14 | data_config:
15 | data_path: glue
16 | subset: mrpc
17 | local_path: dataset/glue/mrpc/1.0.0
18 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
19 | # train_file:
20 | # validation_file:
21 | input_columns:
22 | - "sentence"
23 | validation_column: validation
24 | # labels
25 | train_config:
26 | base_config:
27 | checkpoints_output_dir: finetune_models/
28 | per_device_train_batch_size: 8
29 | learning_rate: 2e-5
30 | num_train_epochs: 2
31 | weight_decay: 0.01
32 | logging_strategy: steps
33 | evaluation_strategy: steps
34 | save_strategy: steps
35 | save_steps: 100
36 |
--------------------------------------------------------------------------------
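
The files under models/ are plain YAML; a quick inspection sketch with PyYAML (pinned in requirements.txt), independent of whatever loader the CLI uses:

    import yaml

    with open("models/ft--sequenceclassification--bert-base-uncased.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    print(cfg["model_config"]["model_id"])                # bert-base-uncased
    print(cfg["ft_config"]["data_config"]["local_path"])  # dataset/glue/mrpc/1.0.0
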
/models/ft--text-generation--Qwen-Qwen-7B-Chat.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: text-generation
4 | model_id: Qwen/Qwen-7B-Chat
5 | max_input_words: 800
6 | initialization:
7 | initializer:
8 | type: Finetune
9 | dtype: float32
10 | from_pretrained_kwargs:
11 | trust_remote_code: true
12 | ft_config:
13 | ft_task: "text-generation"
14 | data_config:
15 | data_path: AdvertiseGen
16 | local_path: dataset/AdvertiseGen
17 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
18 | input_columns:
19 | - "content"
20 | validation_column: summary
21 | train_config:
22 | base_config:
23 | max_length: 500
24 | checkpoints_output_dir: /tmp/finetune
25 | per_device_train_batch_size: 1
26 | per_device_eval_batch_size: 1
27 | learning_rate: 2e-5
28 | num_train_epochs: 2
29 | weight_decay: 0.01
30 | remove_unused_columns: true
31 | logging_strategy: steps
32 | evaluation_strategy: steps
33 | save_strategy: steps
34 | save_steps: 25
35 | max_steps: 50
36 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/callback.py:
--------------------------------------------------------------------------------
1 | from transformers import TrainerCallback, TrainerState, TrainerControl, TrainingArguments
2 | import threading
3 | import queue
4 |
5 | QUEUE = queue.Queue()
6 | def send_metrics():
7 | while True:
8 | item = QUEUE.get()
9 | print("============")
10 | print(item)
11 | QUEUE.task_done()
12 |
13 | threading.Thread(target=send_metrics, daemon=True).start()
14 |
15 | class CustomCallback(TrainerCallback):
16 | """
17 | Overriding the trainer callback to be able to compute training accuracy as well
18 | Example taken from:
19 | https://stackoverflow.com/questions/67457480/how-to-get-the-accuracy-per-epoch-or-step-for-the-huggingface-transformers-train
20 | """
21 | METRICS_FILE = "./metrics"
22 |
23 | def __init__(self, trainer) -> None:
24 | super().__init__()
25 | self._trainer = trainer
26 |
27 | def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
28 | if control.should_log:
29 | if len(state.log_history) != 0:
30 | QUEUE.put(state.log_history[-1])
31 | return control
--------------------------------------------------------------------------------
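
CustomCallback needs a handle to the trainer it observes, so it is attached after construction via Trainer.add_callback; a hedged sketch, where the model, training arguments and datasets are placeholders prepared elsewhere (they are not defined in this file):

    from transformers import Trainer

    from llmadmin.backend.llm.ft.callback import CustomCallback

    # `model`, `training_args`, `train_ds` and `eval_ds` are assumed to exist already.
    trainer = Trainer(model=model, args=training_args,
                      train_dataset=train_ds, eval_dataset=eval_ds)
    trainer.add_callback(CustomCallback(trainer))  # pushes the latest log entry onto QUEUE at each logging step
    trainer.train()
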
/models/ft--text-generation--THUDM-chatglm2-6b.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: text-generation
4 | model_id: THUDM/chatglm2-6b
5 | max_input_words: 800
6 | quantization_bit: 4
7 | initialization:
8 | initializer:
9 | type: Finetune
10 | dtype: float32
11 | from_pretrained_kwargs:
12 | trust_remote_code: true
13 | # load_in_8bit: True
14 | ft_config:
15 | ft_task: "text-generation"
16 | data_config:
17 | data_path: AdvertiseGen
18 | local_path: dataset/AdvertiseGen
19 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
20 | input_columns:
21 | - "content"
22 | validation_column: summary
23 | train_config:
24 | base_config:
25 | max_length: 500
26 | checkpoints_output_dir: /tmp/finetune
27 | per_device_train_batch_size: 1
28 | per_device_eval_batch_size: 1
29 | learning_rate: 2e-5
30 | num_train_epochs: 2
31 | weight_decay: 0.01
32 | remove_unused_columns: true
33 | logging_strategy: steps
34 | evaluation_strategy: steps
35 | save_strategy: steps
36 | save_steps: 25
37 | max_steps: 50
38 |
--------------------------------------------------------------------------------
/models/ft--text-generation--Qwen-Qwen-7B.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: text-generation
4 | model_id: Qwen/Qwen-7B
5 | max_input_words: 800
6 | initialization:
7 | initializer:
8 | type: Finetune
9 | dtype: float32
10 | from_pretrained_kwargs:
11 | trust_remote_code: true
12 | add_special_tokens:
13 | pad_token: "<|extra_0|>"
14 | eos_token: "<|endoftext|>"
15 | ft_config:
16 | ft_task: "text-generation"
17 | data_config:
18 | data_path: AdvertiseGen
19 | local_path: dataset/AdvertiseGen
20 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
21 | input_columns:
22 | - "content"
23 | validation_column: summary
24 | train_config:
25 | base_config:
26 | max_length: 500
27 | checkpoints_output_dir: /tmp/finetune
28 | per_device_train_batch_size: 1
29 | per_device_eval_batch_size: 1
30 | learning_rate: 2e-5
31 | num_train_epochs: 2
32 | weight_decay: 0.01
33 | remove_unused_columns: true
34 | logging_strategy: steps
35 | evaluation_strategy: steps
36 | save_strategy: steps
37 | save_steps: 25
38 | max_steps: 50
39 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/methods/lora.py:
--------------------------------------------------------------------------------
1 | from peft import get_peft_model
2 | from llmadmin.backend.logger import get_logger
3 |
4 | logger = get_logger(__name__)
5 |
6 | def get_trainable_parameters(model):
7 | """
8 | get the number of trainable parameters in the model.
9 | """
10 | trainable_params = 0
11 | all_param = 0
12 | for _, param in model.named_parameters():
13 | all_param += param.numel()
14 | if param.requires_grad:
15 | trainable_params += param.numel()
16 | logger.info(
17 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
18 | )
19 |
20 | def lora_model(model, lora_config):
21 | logger.info("Load lora config")
22 | logger.info(lora_config)
23 | # from peft import LoraConfig, TaskType
24 | # lora_config = LoraConfig(
25 | # task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
26 | # )
27 | # logger.info(lora_config)
28 | lora_config.loftq_config = {}
29 | logger.info("Using peft to avoid Catastrophic Forgetting")
30 | model = get_peft_model(model, lora_config)
31 | get_trainable_parameters(model)
32 | return model
33 |
--------------------------------------------------------------------------------
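
A minimal sketch of calling lora_model directly with a peft LoraConfig, mirroring the commented-out example inside the function; the base model choice here is only illustrative:

    from peft import LoraConfig, TaskType
    from transformers import AutoModelForSequenceClassification

    from llmadmin.backend.llm.ft.methods.lora import lora_model

    base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1)
    peft_model = lora_model(base_model, lora_config)  # wraps with get_peft_model and logs trainable params
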
/llmadmin/frontend/mongo_secrets.py:
--------------------------------------------------------------------------------
1 | # Use this code snippet in your app.
2 | # If you need more information about configurations
3 | # or implementing the sample code, visit the AWS docs:
4 | # https://aws.amazon.com/developer/language/python/
5 |
6 | import json
7 | import logging
8 | import os
9 |
10 | import boto3
11 |
12 |
13 | def get_mongo_secret_url():
14 | mongo_url = os.getenv("MONGODB_URL")
15 | if mongo_url:
16 | return mongo_url
17 | try:
18 | secret_name = "prod/frontend/mongo_password"
19 | region_name = "us-west-2"
20 |
21 | # Create a Secrets Manager client
22 | session = boto3.session.Session()
23 | client = session.client(service_name="secretsmanager", region_name=region_name)
24 |
25 | get_secret_value_response = client.get_secret_value(SecretId=secret_name)
26 |
27 | # Decrypts secret using the associated KMS key.
28 | secret = get_secret_value_response["SecretString"]
29 |
30 | secret_dict = json.loads(secret)
31 | mongo_url = secret_dict.get("url")
32 | return mongo_url
33 | except Exception as e:
34 | # Fail quietly if we can't get the secret
35 | logging.warning(f"Failed to retrieve mongo secret, Exception: {e}")
36 |
--------------------------------------------------------------------------------
/llmadmin/common/llm_event.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from enum import Enum
3 | from typing import Dict, List, Optional
4 |
5 | from pydantic import BaseModel
6 |
7 |
8 | class Flag(Enum):
9 | HATE = "hate"
10 | OBSCENE = "obscene"
11 | WRONG_LANGUAGE = "wrong-language"
12 | NONFACTUAL = "non-factual"
13 |
14 |
15 | class Vote(BaseModel):
16 | llm: str
17 | score: float
18 |
19 |
20 | class LlmResponse(BaseModel):
21 | model_id: str
22 | text: str
23 | model_config: Optional[Dict]
24 | gen_stats: Optional[Dict]
25 |
26 |
27 | class LlmEvent(BaseModel):
28 | created_at: datetime
29 | # Name of the project
30 | project_name: str
31 |
32 | # Identifier for a session
33 | session_id: Optional[str]
34 |
35 | # unique string representing this event
36 | instance_id: str
37 |
38 | # Prompt given by the user
39 | user_prompt: str
40 | responses: List[LlmResponse]
41 |
42 | # Vote is a dictionary by llm and the votes
43 | # that model got. Typically, this is 1.
44 | votes: Optional[List[Vote]]
45 | vote_comments: Optional[Dict[str, str]]
46 |
47 | # Key: llm
48 | # Value: list of flags
49 | flag: Optional[Dict[str, List[Flag]]]
50 |
51 | # Key: llm
52 | # Value: Comment for each llm
53 | flag_comments: Optional[Dict[str, str]]
54 |
--------------------------------------------------------------------------------
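
A construction sketch for these pydantic (v1.x, as pinned in requirements.txt) models; every value below is illustrative:

    from datetime import datetime

    from llmadmin.common.llm_event import LlmEvent, LlmResponse, Vote

    event = LlmEvent(
        created_at=datetime.utcnow(),
        project_name="llm-finetune-demo",
        instance_id="evt-0001",
        user_prompt="Summarize this product description.",
        responses=[LlmResponse(model_id="THUDM/chatglm3-6b", text="...")],
        votes=[Vote(llm="THUDM/chatglm3-6b", score=1.0)],
    )
    print(event.json())  # pydantic v1 serialization; unset Optional fields default to None
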
/models/ft--sequenceclassification--bert-base-uncased-lora.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: fill-mask
4 | model_id: bert-base-uncased
5 | initialization:
6 | initializer:
7 | type: Finetune
8 | dtype: float32
9 | from_pretrained_kwargs:
10 | trust_remote_code: true
11 | ft_config:
12 | # ft_stage: "sft"
13 | ft_method: "lora"
14 | ft_task: "sequenceclassification"
15 | data_config:
16 | data_path: glue
17 | subset: mrpc
18 | local_path: dataset/glue/mrpc/1.0.0
19 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
20 | input_columns:
21 | - "sentence"
22 | validation_column: validation
23 | train_config:
24 | lora_config:
25 | r: 1 # Lora attention dimension
26 | task_type: SEQ_CLS #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION
27 | lora_alpha: 1 # The alpha parameter for Lora scaling
28 | lora_dropout: 0.1 # The dropout probability for Lora layers
29 | base_config:
30 | checkpoints_output_dir: finetune_models/
31 | per_device_train_batch_size: 8
32 | learning_rate: 2e-5
33 | num_train_epochs: 2
34 | weight_decay: 0.01
35 | logging_strategy: steps
36 | evaluation_strategy: steps
37 | save_strategy: steps
38 | save_steps: 100
39 |
--------------------------------------------------------------------------------
/models/ft--maskedlm--distilbert-base-uncased.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: fill-mask
4 | model_id: distilbert-base-uncased
5 | max_input_words: 800
6 | initialization:
7 | runtime_env:
8 | pip:
9 | - deepspeed==0.9.2
10 | - accelerate
11 | s3_mirror_config:
12 | bucket_uri: /tmp/hub/models/distilbert-base-uncased/
13 | # bucket_uri: s3://large-dl-models-mirror/models--amazon--LightGPT/main-safetensors/
14 | initializer:
15 | type: Finetune
16 | dtype: float32
17 | from_pretrained_kwargs:
18 | # use_cache: true
19 | trust_remote_code: true
20 | # use_kernel: true # for deepspeed type only
21 | # max_tokens: 1536 # for deepspeed type only
22 | ft_config:
23 | ft_task: maskedlm
24 | data_config:
25 | data_path: imdb
26 | subset:
27 | local_path: /tmp/hub/dataset/imdb/plain_text/1.0.0
28 | num_row: 30
29 | # train_file:
30 | # validation_file:
31 | input_columns:
32 | - "sentence"
33 | validation_column: validation
34 | # labels
35 | train_config:
36 | base_config:
37 | checkpoints_output_dir: /tmp/finetune
38 | per_device_train_batch_size: 32
39 | learning_rate: 2e-5
40 | num_train_epochs: 2
41 | weight_decay: 0.01
42 | remove_unused_columns: false
43 | logging_strategy: steps
44 | evaluation_strategy: steps
45 | save_strategy: steps
46 | save_steps: 100
47 | scaling_config:
48 | num_workers: 7
49 | num_gpus_per_worker: 0
50 | num_cpus_per_worker: 1 # for inference
51 | # resources_per_worker:
52 | # accelerator_type_cpu: 0.01
53 | ray_actor_options:
54 | num_cpus: 0.1
55 |
--------------------------------------------------------------------------------
/models/ft--text-generation--THUDM-chatglm3-6b.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | warmup: True
3 | model_task: text-generation
4 | model_id: THUDM/chatglm3-6b
5 | max_input_words: 800
6 | quantization_bit: 4
7 | initialization:
8 | # s3_mirror_config:
9 | # endpoint_url: http://39.107.108.170:9000 # Optional for custom S3 storage endpoint url
10 | # bucket_uri: s3://opt-125m/facemodel/ # Must include hash file with commit id in repo
11 | # bucket_uri: /root/.cache/hub/ZhipuAI/chatglm3-6b/ # Local path of model with hash file
12 | initializer:
13 | type: Finetune
14 | dtype: float32
15 | from_pretrained_kwargs:
16 | trust_remote_code: true
17 | ft_config:
18 | ft_task: "text-generation"
19 | ft_method: "lora"
20 | data_config:
21 | data_path: AdvertiseGen
22 | local_path: dataset/AdvertiseGen
23 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data
24 | input_columns:
25 | - "content"
26 | validation_column: summary
27 | train_config:
28 | lora_config:
29 | r: 1 # Lora attention dimension
30 | task_type: CAUSAL_LM #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION
31 | lora_alpha: 1 # The alpha parameter for Lora scaling
32 | lora_dropout: 0.1 # The dropout probability for Lora layers
33 | base_config:
34 | max_length: 500
35 | checkpoints_output_dir: /tmp/finetune
36 | per_device_train_batch_size: 1
37 | per_device_eval_batch_size: 1
38 | learning_rate: 2e-5
39 | num_train_epochs: 2
40 | weight_decay: 0.01
41 | remove_unused_columns: true
42 | logging_strategy: steps
43 | evaluation_strategy: steps
44 | save_strategy: steps
45 | save_steps: 25
46 | max_steps: 50
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import find_packages, setup
3 | this_directory = os.path.abspath(os.path.dirname(__file__))
4 | with open(os.path.join(this_directory, "requirements.txt"), encoding="utf-8") as f:
5 | INSTALL_REQUIRES = f.read().splitlines()
6 |
7 | EXTRAS_REQUIRE = {
8 | "dev": INSTALL_REQUIRES + [
9 | "pre-commit",
10 | "ruff==0.0.270",
11 | "black==23.3.0",
12 | ],
13 | "test": INSTALL_REQUIRES + [
14 | "pytest",
15 | ],
16 | "docs": INSTALL_REQUIRES + [
17 | "mkdocs-material",
18 | ],
19 | }
20 |
21 | setup(
22 | name="llmfinetune",
23 | version="0.0.1",
24 | description="A framework to finetune LLMs",
25 | long_description=open("README.md", "r", encoding="utf-8").read(),
26 | long_description_content_type="text/markdown",
27 | packages=find_packages(include=["llmadmin*"]),
28 | keywords=["ChatGLM", "BaiChuan", "LLaMA", "BLOOM", "Falcon",
29 | "LLM", "ChatGPT", "transformer", "pytorch", "deep learning"],
30 | include_package_data=True,
31 | package_data={"llmadmin": ["models/*"]},
32 | entry_points={
33 | "console_scripts": [
34 | "llmfinetune=llmadmin.api.cli:app",
35 | ]
36 | },
37 | extras_require=EXTRAS_REQUIRE,
38 | install_requires=INSTALL_REQUIRES,
39 | python_requires=">=3.8",
40 | classifiers=[
41 | "Development Status :: 3 - Alpha",
42 | "Intended Audience :: Developers",
43 | "Intended Audience :: Education",
44 | "Intended Audience :: Science/Research",
45 | "License :: OSI Approved :: Apache Software License",
46 | "Operating System :: OS Independent",
47 | "Programming Language :: Python :: 3.8",
48 | "Programming Language :: Python :: 3.9",
49 | "Programming Language :: Python :: 3.10",
50 | "Programming Language :: Python :: 3.11",
51 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
52 | ]
53 | )
54 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from llmadmin.backend.logger import get_logger
3 | from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset
4 | from typing import Union, TYPE_CHECKING, List
5 | from transformers import PreTrainedModel, PreTrainedTokenizer
6 | from llmadmin.backend.server.models import FTApp
7 | import torch
8 | from llmadmin.backend.llm.initializers import get_initializer_cls_by_name
9 |
10 | if TYPE_CHECKING:
11 | from ..initializers._base import LLMInitializer
12 |
13 | logger = get_logger(__name__)
14 |
15 | class BaseFT(ABC):
16 | """base fine tune class.
17 |
18 | Args:
19 | """
20 |
21 | def __init__(
22 | self,
23 | ftapp: FTApp,
24 | ) -> None:
25 | self.ftapp = ftapp
26 | self.data_conf = ftapp.ft_config.data_config
27 | self.train_conf = ftapp.ft_config.train_config.base_config
28 | self.model_config = ftapp.model_config
29 | self.ft_task = ftapp.ft_config.ft_task
30 | self.scale_config = ftapp.scaling_config
31 |
32 | # Lazy import so that the new cache location is used
33 | torch.backends.cuda.matmul.allow_tf32 = True
34 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
35 |
36 | initializer_name = self.model_config.initialization.initializer
37 | if not isinstance(initializer_name, str):
38 | initializer_name = initializer_name.type
39 |
40 | logger.info(f"Finetune initializer name '{initializer_name}' on device {device}")
41 | initializer = get_initializer_cls_by_name(initializer_name)(
42 | device=device,
43 | world_size=1, # fake
44 | **self.model_config.initialization.initializer.get_initializer_kwargs(),
45 | )
46 |
47 | self.initializer = initializer
48 |
49 | @abstractmethod
50 | def train(self):
51 | pass
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/test/test_seq_cls_bert_yelp.py:
--------------------------------------------------------------------------------
1 | # Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training
2 |
3 | import numpy as np
4 | import evaluate
5 | from datasets import load_dataset
6 | from transformers import (
7 | Trainer,
8 | TrainingArguments,
9 | AutoTokenizer,
10 | AutoModelForSequenceClassification,
11 | )
12 |
13 | num_labels = 5
14 | modelPath = "bert-base-cased"
15 | # modelPath = "/Users/hub/models/bert-base-cased"  # optional local mirror
16 | dsPath = "yelp_review_full"
17 | # dsPath = "/Users/hub/models/yelp_review_full/1.0.0"  # optional local mirror
18 |
19 | # Datasets
20 | dataset = load_dataset(dsPath)
21 | print('Loaded dataset', dataset)
22 |
23 | tokenizer = AutoTokenizer.from_pretrained(modelPath)
24 |
25 | def tokenize_function(examples):
26 | return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
27 |
28 | count = 10
29 | small_train_dataset = dataset["train"].select(range(count)).map(tokenize_function, batched=True)
30 | small_eval_dataset = dataset["test"].select(range(count)).map(tokenize_function, batched=True)
31 | print('small train dataset', small_train_dataset)
32 | print('small eval dataset', small_eval_dataset)
33 |
34 | # Model
35 | model = AutoModelForSequenceClassification.from_pretrained(modelPath, num_labels=num_labels)
36 |
37 | # Metrics
38 | metric = evaluate.load("accuracy")
39 |
40 | def compute_metrics(eval_pred):
41 | logits, labels = eval_pred
42 | predictions = np.argmax(logits, axis=-1)
43 | return metric.compute(predictions=predictions, references=labels)
44 |
45 | # Hugging Face Trainer
46 | training_args = TrainingArguments(
47 | output_dir="test_trainer",
48 | evaluation_strategy="epoch",
49 | report_to="none"
50 | )
51 |
52 | trainer = Trainer(
53 | model=model,
54 | args=training_args,
55 | train_dataset=small_train_dataset,
56 | eval_dataset=small_eval_dataset,
57 | compute_metrics=compute_metrics,
58 | )
59 |
60 | # Start Training
61 | trainer.train()
62 |
--------------------------------------------------------------------------------
/llmadmin/frontend/javascript_loader.py:
--------------------------------------------------------------------------------
1 | # https://github.com/gradio-app/gradio/discussions/2932
2 | import mimetypes
3 | import os
4 |
5 | import gradio.routes
6 |
7 | mimetypes.init()
8 | mimetypes.add_type("application/javascript", ".js")
9 |
10 |
11 | class ScriptLoader:
12 | path_map = {
13 | "js": os.path.abspath(os.path.join(os.path.dirname(__file__), "javascript")),
14 | "py": os.path.abspath(os.path.join(os.path.dirname(__file__), "python")),
15 | }
16 |
17 | def __init__(self, script_type):
18 | self.script_type = script_type
19 | self.path = ScriptLoader.path_map[script_type]
20 | self.loaded_scripts = []
21 |
22 | @staticmethod
23 | def get_scripts(path: str, file_type: str) -> list[tuple[str, str]]:
24 | scripts = []
25 | dir_list = [os.path.join(path, f) for f in os.listdir(path)]
26 | files_list = [f for f in dir_list if os.path.isfile(f)]
27 | for s in files_list:
28 | # Don't forget the "." for the file extension
29 | if os.path.splitext(s)[1] == f".{file_type}":
30 | scripts.append((s, os.path.basename(s)))
31 | return scripts
32 |
33 |
34 | class JavaScriptLoader(ScriptLoader):
35 | def __init__(self):
36 | super().__init__("js")
37 | self.original_template = gradio.routes.templates.TemplateResponse
38 | self.load_js()
39 | gradio.routes.templates.TemplateResponse = self.template_response
40 |
41 | def load_js(self):
42 | js_scripts = ScriptLoader.get_scripts(self.path, self.script_type)
43 | for file_path, file_name in js_scripts:
44 | with open(file_path, "r", encoding="utf-8") as file:
45 | self.loaded_scripts.append(
46 | f"\n<script>{file.read()}</script>\n"
47 | )
48 |
49 | def template_response(self, *args, **kwargs):
50 | response = self.original_template(*args, **kwargs)
51 | response.body = response.body.replace(
52 | "</head>".encode("utf-8"),
53 | f"{''.join(self.loaded_scripts)}\n</head>".encode("utf-8"),
54 | )
55 | response.init_headers()
56 | return response
57 |
--------------------------------------------------------------------------------
/llmadmin/frontend/app.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | import uuid
5 | from typing import Any, Dict, List
6 | import ray
7 | import requests
8 |
9 | from llmadmin.common.backend import get_llmadmin_backend
10 | from llmadmin.common.constants import (
11 | AVIARY_DESC,
12 | CSS,
13 | EXAMPLES_IF,
14 | EXAMPLES_QA,
15 | EXAMPLES_ST,
16 | HEADER,
17 | LOGO_ANYSCALE,
18 | LOGO_GITHUB,
19 | LOGO_RAY,
20 | LOGO_RAY_TYPEFACE,
21 | MODEL_DESCRIPTION_FORMAT,
22 | MODEL_DESCRIPTIONS_HEADER,
23 | MODELS,
24 | NUM_LLM_OPTIONS,
25 | PROJECT_NAME,
26 | SELECTION_DICT,
27 | SUB_HEADER,
28 | )
29 | from llmadmin.frontend.javascript_loader import JavaScriptLoader
30 | from llmadmin.frontend.leaderboard import DummyLeaderboard, Leaderboard
31 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url
32 | from llmadmin.frontend.utils import (
33 | DEFAULT_STATS,
34 | LOGGER,
35 | THEME,
36 | blank,
37 | deactivate_buttons,
38 | gen_stats,
39 | log_flags,
40 | paused_logger,
41 | select_button,
42 | unset_buttons,
43 | )
44 |
45 | std_logger = logging.getLogger("ray.logger")
46 |
47 | @ray.remote(num_cpus=0)
48 | def completions(backend, prompt, llm, index):
49 | try:
50 | out = backend.completions(prompt=prompt, llm=llm)
51 | except Exception as e:
52 | if isinstance(e, requests.ReadTimeout) or (
53 | hasattr(e, "response")
54 | and ("timeout" in e.response or e.response.status_code in (408, 504))
55 | ):
56 | out = (
57 | "[LLM-ADMIN] The request timed out. This usually means the server "
58 | "is experiencing a higher than usual load. "
59 | "Please try again in a few minutes."
60 | )
61 | elif hasattr(e, "response"):
62 | out = (
63 | f"[LLM-ADMIN] Backend returned an error. "
64 | f"Status code: {e.response.status_code}"
65 | f"\nResponse: {e.response.text.split('raise ')[-1]}"
66 | ).replace("\n", " ")
67 | else:
68 | out = f"[LLM-ADMIN] An error occurred. Please try again.\nError: {e}"
69 | out = {"error": out}
70 | return out, index
--------------------------------------------------------------------------------
/llmadmin/frontend/javascript/llmadmin.js:
--------------------------------------------------------------------------------
1 | // Set favicon
2 | const FAVICON =
3 | "data:image/svg+xml,";
4 | function setFavicon(link) {
5 | let favicon = document.querySelector('link[rel="icon"]');
6 |
7 | if (favicon) {
8 | favicon.href = link;
9 | } else {
10 | favicon = document.createElement("link");
11 | favicon.rel = "icon";
12 | favicon.href = link;
13 |
14 | document.head.appendChild(favicon);
15 | }
16 | }
17 | // setFavicon(FAVICON);
18 |
19 | // Get news
20 | const NEWS_URL = "https://api.github.com/repos/ray-project/llmadmin/issues/8";
21 | function getNews(newsUrl) {
22 | return fetch(newsUrl)
23 | .then((response) => {
24 | if (!response.ok) {
25 | throw new Error("Unable to fetch news.");
26 | }
27 | return response.text();
28 | })
29 | .then((data) => {
30 | return JSON.parse(data)["title"];
31 | })
32 | .catch((error) => console.error("Unable to parse response: ", error));
33 | }
34 |
35 | // Wait for the ticker div to be added to DOM to set the news content
36 | const observer = new MutationObserver((mutationsList, observer) => {
37 | for (let mutation of mutationsList) {
38 | if (mutation.type === "childList") {
39 | let element = document.getElementsByClassName("ticker");
40 | if (element.length > 0) {
41 | getNews(NEWS_URL).then((newsTitle) => {
42 | document.getElementsByClassName("ticker")[0].innerHTML =
43 | "\uD83D\uDCE3 " + newsTitle;
44 | });
45 | observer.disconnect();
46 | break;
47 | }
48 | }
49 | }
50 | });
51 |
52 | (function () {
53 | // Add Google Tag Manager
54 | const head = document.getElementsByTagName("head")[0];
55 | var gtm = document.createElement("script");
56 | gtm.text =
57 | "(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-5ZPDX2P');";
58 | head.insertBefore(gtm, head.children[0]);
59 |
60 | document.addEventListener("DOMContentLoaded", function () {
61 | observer.observe(document.body, { childList: true, subtree: true });
62 | });
63 | })();
--------------------------------------------------------------------------------
/llmadmin/frontend/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | # import gradio as gr
4 |
5 | from llmadmin.common.constants import (
6 | G5_COST_PER_S_IN_DOLLARS,
7 | NUM_LLM_OPTIONS,
8 | PROJECT_NAME,
9 | )
10 | from llmadmin.frontend.mongo_logger import MongoLogger
11 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url
12 |
13 | LOGGER = None
14 |
15 | # MONGODB_URL = get_mongo_secret_url()
16 | # if MONGODB_URL:
17 | # LOGGER = MongoLogger(url=MONGODB_URL, project_name=PROJECT_NAME)
18 | # else:
19 | # print("No MongoDB logger defined, will default to the CSVLogger")
20 | # LOGGER = gr.CSVLogger()
21 | # LOGGER = gr.CSVLogger()
22 |
23 |
24 | DEFAULT_STATS = """
25 | | | |
26 | |---|---|
27 | | Latency [s] | - |
28 | | Cost [$] | - |
29 | | Tokens (i/o) | - |
30 | | Per 1K Tokens [$] | - |
31 | """
32 |
33 |
34 | def gen_stats(dictionary):
35 | cost_per_k = (
36 | dictionary["total_time"]
37 | * G5_COST_PER_S_IN_DOLLARS
38 | / dictionary["num_total_tokens"]
39 | * 1000
40 | )
41 |
42 | return f"""
43 | | | |
44 | |---|---|
45 | | Lat [s] | {dictionary['total_time']:.1f} |
46 | | Cost [$] | {dictionary['total_time'] * G5_COST_PER_S_IN_DOLLARS:.4f} |
47 | | Tokens (i/o) | {dictionary['num_total_tokens']:.1f} |
48 | | Per 1K Tok [$] | {cost_per_k:.4f} |
49 | """
50 |
51 |
52 | def blank():
53 | return ""
54 |
55 |
56 | # def select_button(button):
57 | # return button, gr.Button.update(variant="primary")
58 |
59 |
60 | # def deactivate_buttons():
61 | # return [gr.Button.update(interactive=False)] * NUM_LLM_OPTIONS
62 |
63 |
64 | # def unset_buttons():
65 | # return [gr.Button.update(variant="secondary", interactive=True)] * NUM_LLM_OPTIONS
66 |
67 |
68 | # def paused_logger(*args):
69 | # time.sleep(1)
70 | # LOGGER.flag(*args)
71 |
72 |
73 | # def log_flags(*args):
74 | # LOGGER.flag(args)
75 |
76 |
77 | # THEME = gr.themes.Default(
78 | # primary_hue="blue",
79 | # secondary_hue="blue",
80 | # ).set(
81 | # border_color_accent="blue",
82 | # shadow_spread="20",
83 | # shadow_spread_dark="0",
84 | # button_primary_background_fill="*primary_200",
85 | # button_primary_background_fill_dark="*primary_700",
86 | # button_primary_border_color_dark="*primary_600",
87 | # )
88 |
--------------------------------------------------------------------------------
/llmadmin/backend/server/_batch.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from dataclasses import dataclass, field
3 | from enum import IntEnum
4 | from functools import wraps
5 | from typing import Any, Callable, List, Optional, Tuple, Type
6 |
7 | # TODO: Upstream to Serve.
8 |
9 |
10 | def extract_self_if_method_call(args: List[Any], func: Callable) -> Optional[object]:
11 | """Check if this is a method rather than a function.
12 |
13 | Does this by checking to see if `func` is the attribute of the first
14 | (`self`) argument under `func.__name__`. Unfortunately, this is the most
15 | robust solution to this I was able to find. It would also be preferable
16 | to do this check when the decorator runs, rather than when the method is.
17 |
18 | Returns the `self` object if it's a method call, else None.
19 |
20 | Arguments:
21 | args: arguments to the function/method call.
22 | func: the unbound function that was called.
23 | """
24 | if len(args) > 0:
25 | method = getattr(args[0], func.__name__, False)
26 | if method:
27 | wrapped = getattr(method, "__wrapped__", False)
28 | if wrapped and wrapped == func:
29 | return args[0]
30 |
31 | return None
32 |
33 |
34 | class QueuePriority(IntEnum):
35 | """Lower value = higher priority"""
36 |
37 | GENERATE_TEXT = 0
38 | BATCH_GENERATE_TEXT = 1
39 |
40 |
41 | @dataclass(order=True)
42 | class _PriorityWrapper:
43 | """Wrapper allowing for priority queueing of arbitrary objects."""
44 |
45 | obj: Any = field(compare=False)
46 | priority: int = field(compare=True)
47 |
48 |
49 | class PriorityQueueWithUnwrap(asyncio.PriorityQueue):
50 | def get_nowait(self) -> Any:
51 | # Get just the obj from _PriorityWrapper
52 | ret: _PriorityWrapper = super().get_nowait()
53 | return ret.obj
54 |
55 |
56 | def _validate_max_batch_size(max_batch_size):
57 | if not isinstance(max_batch_size, int):
58 | if isinstance(max_batch_size, float) and max_batch_size.is_integer():
59 | max_batch_size = int(max_batch_size)
60 | else:
61 | raise TypeError("max_batch_size must be integer >= 1")
62 |
63 | if max_batch_size < 1:
64 | raise ValueError("max_batch_size must be an integer >= 1")
65 |
66 |
67 | def _validate_batch_wait_timeout_s(batch_wait_timeout_s):
68 | if not isinstance(batch_wait_timeout_s, (float, int)):
69 | raise TypeError("batch_wait_timeout_s must be a float >= 0")
70 |
71 | if batch_wait_timeout_s < 0:
72 | raise ValueError("batch_wait_timeout_s must be a float >= 0")
73 |
74 |
75 |
--------------------------------------------------------------------------------
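
A compact sketch of the priority-queue pieces above: objects go in wrapped, get_nowait hands back the bare object, and lower priority values are served first.

    from llmadmin.backend.server._batch import (
        PriorityQueueWithUnwrap,
        QueuePriority,
        _PriorityWrapper,
    )

    queue = PriorityQueueWithUnwrap()
    queue.put_nowait(_PriorityWrapper(obj="batched request", priority=QueuePriority.BATCH_GENERATE_TEXT))
    queue.put_nowait(_PriorityWrapper(obj="interactive request", priority=QueuePriority.GENERATE_TEXT))

    assert queue.get_nowait() == "interactive request"  # GENERATE_TEXT (0) beats BATCH_GENERATE_TEXT (1)
    assert queue.get_nowait() == "batched request"
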
/llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_cola.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
3 | from typing import Any
4 | import pandas as pd
5 | import evaluate
6 | import numpy as np
7 |
8 |
9 | class SequenceclassificationGlueCola(Task):
10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification
11 |
12 | DATASET_PATH = "glue"
13 | DATASET_NAME = "cola"
14 |
15 | def get_data_proprocess(self) -> Any:
16 | tokenizer = self.tokenizer
17 |
18 | # adopt python decorator TODO
19 | def preprocess_function(examples: pd.DataFrame):
20 | # examples = examples.to_dict("list")
21 | ret = tokenizer(examples["sentence"], truncation=True)
22 |
23 | # Add back the original columns
24 | ret = {**examples, **ret}
25 | return pd.DataFrame.from_dict(ret)
26 |
27 | return preprocess_function
28 |
29 | def get_compute_metrics(self) -> Any:
30 | DATASET_PATH = self.DATASET_PATH
31 | DATASET_NAME = self.DATASET_NAME
32 |
33 | def compute_metrics(eval_preds):
34 | metric = evaluate.load(DATASET_PATH, DATASET_NAME)
35 | logits, labels = eval_preds
36 | predictions = np.argmax(logits, axis=-1)
37 | return metric.compute(predictions=predictions, references=labels)
38 |
39 | return compute_metrics
40 |
41 | def get_data_collator(self) -> Any:
42 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
43 | return data_collator
44 |
45 | def training_key(self):
46 | """
47 | :return: str
48 | Name of the dataset split used for training.
49 | """
50 | return "train"
51 |
52 | def validation_key(self):
53 | """
54 | :return: str
55 | Name of the dataset split used for validation.
56 | """
57 | return "validation"
58 |
59 | def getTrainDataSet(self):
60 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)
61 |
62 | def getEvalDataSet(self):
63 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)
64 |
65 | def getSmallTrainDataSet(self, len: int):
66 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
67 |
68 | def getSmallEvalDataSet(self, len: int):
69 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_mrpc.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
3 | from typing import Any
4 | import pandas as pd
5 | import evaluate
6 | import numpy as np
7 |
8 |
9 | class SequenceclassificationGlueMrpc(Task):
10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification
11 |
12 | DATASET_PATH = "glue"
13 | DATASET_NAME = "mrpc"
14 | FROM_PRETRAINED_KWARGS = {
15 | # "num_labels": 2
16 | }
17 |
18 | def get_data_proprocess(self) -> Any:
19 | tokenizer = self.tokenizer
20 |
21 | # adopt python decorator TODO
22 | def preprocess_function(examples: pd.DataFrame):
23 | # examples = examples.to_dict("list")
24 | ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=self.ft_config.train_config.base_config.max_length)
25 |
26 | # Add back the original columns
27 | ret = {**examples, **ret}
28 | return pd.DataFrame.from_dict(ret)
29 |
30 | return preprocess_function
31 |
32 | def get_compute_metrics(self) -> Any:
33 | DATASET_PATH = self.DATASET_PATH
34 | DATASET_NAME = self.DATASET_NAME
35 |
36 | def compute_metrics(eval_preds):
37 | metric = evaluate.load(DATASET_PATH, DATASET_NAME)
38 | logits, labels = eval_preds
39 | predictions = np.argmax(logits, axis=-1)
40 | return metric.compute(predictions=predictions, references=labels)
41 |
42 | return compute_metrics
43 |
44 | def get_data_collator(self) -> Any:
45 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
46 | return data_collator
47 |
48 | def training_key(self):
49 | """
50 | :return: str
51 | Name of the dataset split used for training.
52 | """
53 | return "train"
54 |
55 | def validation_key(self):
56 | """
57 | :return: str
58 | Name of the dataset split used for validation.
59 | """
60 | return "validation"
61 |
62 | def getTrainDataSet(self):
63 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)
64 |
65 | def getEvalDataSet(self):
66 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)
67 |
68 | def getSmallTrainDataSet(self, len: int):
69 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
70 |
71 | def getSmallEvalDataSet(self, len: int):
72 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
73 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/initializers/_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Tuple
3 |
4 | import torch
5 | from transformers import PreTrainedModel, PreTrainedTokenizer
6 |
7 | from llmadmin.backend.logger import get_logger
8 |
9 | logger = get_logger(__name__)
10 |
11 |
12 | class LLMInitializer(ABC):
13 | """Initialize model and tokenizer and place them on the correct device.
14 |
15 | Args:
16 | device (torch.device): Device to place model and tokenizer on.
17 | world_size (int): Number of GPUs to use.
18 | """
19 |
20 | def __init__(
21 | self,
22 | device: torch.device,
23 | world_size: int,
24 | ):
25 | self.device = device
26 | self.world_size = world_size
27 |
28 | def load(self, model_id: str) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]:
29 | """Load model and tokenizer.
30 |
31 | Args:
32 | model_id (str): Hugging Face model ID.
33 | """
34 | model = self.load_model(model_id)
35 | tokenizer = self.load_tokenizer(model_id)
36 | return self.postprocess(model, tokenizer)
37 |
38 | @abstractmethod
39 | def load_model(self, model_id: str) -> "PreTrainedModel":
40 | """Load model.
41 |
42 | Args:
43 | model_id (str): Hugging Face model ID.
44 | """
45 | pass
46 |
47 | @abstractmethod
48 | def load_tokenizer(self, tokenizer_id: str) -> "PreTrainedTokenizer":
49 | """Load tokenizer.
50 |
51 | Args:
52 | tokenizer_id (str): Hugging Face tokenizer name.
53 | """
54 | pass
55 |
56 | def postprocess(
57 | self, model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"
58 | ) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]:
59 | """Postprocess model and tokenizer.
60 |
61 | Args:
62 | model (PreTrainedModel): Model to postprocess.
63 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess.
64 | """
65 | return self.postprocess_model(model), self.postprocess_tokenizer(tokenizer)
66 |
67 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel":
68 | """Postprocess model.
69 |
70 | Args:
71 | model (PreTrainedModel): Model to postprocess.
72 | """
73 | return model
74 |
75 | def postprocess_tokenizer(
76 | self, tokenizer: "PreTrainedTokenizer"
77 | ) -> "PreTrainedTokenizer":
78 | """Postprocess tokenizer.
79 |
80 | Args:
81 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess.
82 | """
83 | return tokenizer
84 |
85 | def get_model_init_kwargs(self) -> dict:
86 | """Return extra keyword arguments to use when initializing the model.
87 | 
88 | Returns:
89 | dict: Extra kwargs; empty by default, subclasses may override.
90 | """
91 | return {}
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/tasks/sequenceclassification_yelp_review_full.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
3 | from typing import Any
4 | import pandas as pd
5 | import evaluate
6 | import numpy as np
7 |
8 |
9 | class SequenceclassificationYelpReviewFull(Task):
10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification
11 |
12 | DATASET_PATH = "yelp_review_full"
13 | DATASET_NAME = ""
14 | FROM_PRETRAINED_KWARGS = {
15 | "num_labels": 5
16 | }
17 |
18 | def get_data_proprocess(self) -> Any:
19 | tokenizer = self.tokenizer
20 |
21 | # adopt python decorator TODO
22 | def preprocess_function(examples: pd.DataFrame):
23 | examples = examples.to_dict("list")
24 | ret = tokenizer(examples["text"], truncation=True)
25 |
26 | # Add back the original columns
27 | ret = {**examples, **ret}
28 | return pd.DataFrame.from_dict(ret)
29 |
30 | return preprocess_function
31 |
32 | def get_compute_metrics(self) -> Any:
33 | DATASET_PATH = self.DATASET_PATH
34 | DATASET_NAME = self.DATASET_NAME
35 |
36 | def compute_metrics(eval_preds):
37 | # metric = evaluate.load(DATASET_PATH, DATASET_NAME)
38 | metric = evaluate.load("accuracy")
39 | logits, labels = eval_preds
40 | predictions = np.argmax(logits, axis=-1)
41 | return metric.compute(predictions=predictions, references=labels)
42 |
43 | return compute_metrics
44 |
45 | def get_data_collator(self) -> Any:
46 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
47 | return data_collator
48 |
49 | def training_key(self):
50 | """
51 | :return: str
52 | The name of the dataset split used for training.
53 | """
54 | return "train"
55 |
56 | def validation_key(self):
57 | """
58 | :return: str
59 | The name of the dataset split used for validation.
60 | """
61 | return "validation"
62 |
63 | def tokenize_function(self, examples):
64 | return self.tokenizer(examples["text"], padding="max_length", truncation=True, max_length=self.ft_config.train_config.base_config.max_length)
65 |
66 | def getTrainDataSet(self):
67 | return self.dataset[self.training_key()].map(self.tokenize_function, batched=True)
68 |
69 | def getEvalDataSet(self):
70 | return self.dataset[self.validation_key()].map(self.tokenize_function, batched=True)
71 |
72 | def getSmallTrainDataSet(self, len: int):
73 | return self.dataset[self.training_key()].select(range(len)).map(self.tokenize_function, batched=True)
74 |
75 | def getSmallEvalDataSet(self, len: int):
76 | return self.dataset[self.validation_key()].select(range(len)).map(self.tokenize_function, batched=True)
77 |
--------------------------------------------------------------------------------
/llmadmin/frontend/mongo_logger.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from datetime import datetime, timezone
3 | from typing import Any
4 |
5 | # from gradio import FlaggingCallback
6 | from pymongo import MongoClient
7 |
8 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME
9 | from llmadmin.common.llm_event import LlmEvent, LlmResponse, Vote
10 |
11 |
12 | # class MongoLogger(FlaggingCallback):
13 | # """Logs flagged events to Mongo DB."""
14 |
15 | # def __init__(self, url, project_name) -> None:
16 | # self.url = url
17 | # self.client = MongoClient(url)
18 | # self.project_name = project_name
19 | # self.components = None
20 | # try:
21 | # self.client.admin.command("ping")
22 | # print("Pinged MongoDB. Correctly set up")
23 | # except Exception as e:
24 | # print(e)
25 |
26 | # def setup(self, components):
27 | # self.components = components
28 | # # Check if the database exists
29 | # if DB_NAME in self.client.list_database_names():
30 | # self.db = self.client[DB_NAME]
31 | # print(f"Database '{DB_NAME}' already exists.")
32 | # else:
33 | # # The database doesn't exist, so create it
34 | # self.db = self.client[DB_NAME]
35 | # print(f"Database '{DB_NAME}' created.")
36 |
37 | # # OK, now we create a collection.
38 | # # Check if the collection exists
39 | # if COLLECTION_NAME in self.db.list_collection_names():
40 | # # The collection exists
41 | # print(
42 | # f"Collection '{COLLECTION_NAME}' already exists in database '{DB_NAME}'."
43 | # )
44 | # else:
45 | # # The collection doesn't exist, so create it
46 | # self.db.create_collection(COLLECTION_NAME)
47 | # print(f"Collection '{COLLECTION_NAME}' created in database '{DB_NAME}'.")
48 |
49 | # def flag(self, flag_data: list[Any], flag_option: str = "", username: str = ""):
50 | # print(f"last value is: {flag_data}")
51 | # event = LlmEvent(
52 | # project_name=self.project_name,
53 | # created_at=datetime.now(timezone.utc),
54 | # instance_id=str(uuid.uuid4()),
55 | # user_prompt=flag_data[0],
56 | # # TODO(mwk): Work out how to generalize this to _n_ inputs
57 | # responses=[
58 | # LlmResponse(
59 | # model_id=flag_data[1], text=flag_data[4], gen_stats=flag_data[8][0]
60 | # ),
61 | # LlmResponse(
62 | # model_id=flag_data[2], text=flag_data[5], gen_stats=flag_data[8][1]
63 | # ),
64 | # LlmResponse(
65 | # model_id=flag_data[3], text=flag_data[6], gen_stats=flag_data[8][2]
66 | # ),
67 | # ],
68 | # session_id=flag_data[9],
69 | # )
70 | # if flag_data[7]:
71 | # vote_number = int(flag_data[7][-1])
72 | # event.votes = Vote(llm=flag_data[vote_number], score=1)
73 |
74 | # print(f"Event is {event.json()}")
75 | # result = self.client[DB_NAME][COLLECTION_NAME].insert_one(event.dict())
76 | # print(f"Mongo result {result}")
77 |
--------------------------------------------------------------------------------
/llmadmin/common/evaluation.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import requests
4 |
5 |
6 | class GPT:
7 | """A simple wrapper around the OpenAI API for evaluating GPT models."""
8 |
9 | def __init__(self, model_version="gpt-4", temperature=0.9, max_tokens=2048):
10 | api_key = os.getenv("GPT4_API_KEY")
11 | assert api_key, "Please set the GPT4_API_KEY environment variable"
12 | self.__api_key = os.getenv("GPT4_API_KEY")
13 | self.temperature = temperature
14 | self.max_tokens = max_tokens
15 | self.model = model_version
16 |
17 | def evaluate_results(self, prompt, results):
18 | """Evaluate a list of results generated by several models on a single prompt."""
19 | for result in results:
20 | result.pop("stats", None)
21 |
22 | gpt_messages = [
23 | {
24 | "role": "system",
25 | "content": (
26 | """You are an assistant tasked with ranking responses in
27 | order of quality, creating a leaderboard of all models.
28 | The best model has rank 1, the second best has rank 2, etc.
29 | You have to assess the quality of the responses, and rank them."""
30 | ),
31 | },
32 | {
33 | "role": "user",
34 | "content": (
35 | f"""You are given a prompt and a list of responses
36 | from several models in Python dictionary format.
37 | Specifically, the format of the results is as follows:
38 |
39 | 'model': <model name>, 'result': <model response>
40 |
41 | Your job is to "rank" the responses in order of quality, (not by
42 | the order in which they were generated).
43 |
44 | The prompt is: {prompt}
45 | The responses are: {results}
46 |
47 | Please rank the responses by quality, and return a list of the model
48 | names and ranks, i.e. produce the following output:
49 | 
50 | 'model': <model name>, 'rank': <rank>
51 |
52 | Only output this format, and nothing else. Your response must
53 | be a valid Python dictionary.
54 | Think step by step and give me this quality ranking.
55 | """
56 | ),
57 | },
58 | ]
59 | return self.generate(gpt_messages)
60 |
61 | def generate(self, messages):
62 | data = {
63 | "model": self.model,
64 | "messages": messages,
65 | "max_tokens": self.max_tokens,
66 | "temperature": self.temperature,
67 | }
68 | headers = {
69 | "Content-Type": "application/json",
70 | "Authorization": f"Bearer {self.__api_key}",
71 | }
72 | resp = requests.post(
73 | url="https://api.openai.com/v1/chat/completions", json=data, headers=headers
74 | )
75 |
76 | if not resp.ok:
77 | raise RuntimeError(f"Failed to generate: {resp.reason}")
78 |
79 | return resp.json()["choices"][0]["message"]["content"]
80 |
--------------------------------------------------------------------------------
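A usage sketch for the GPT evaluator above, assuming GPT4_API_KEY is set to a valid OpenAI key; the model names and responses below are invented for illustration.

# Hypothetical usage of the evaluator; requires a real OpenAI key in GPT4_API_KEY.
from llmadmin.common.evaluation import GPT

evaluator = GPT(model_version="gpt-4", temperature=0.0)
results = [
    {"model": "model-a", "result": "Paris is the capital of France."},
    {"model": "model-b", "result": "It might be Lyon."},
]
ranking_text = evaluator.evaluate_results("What is the capital of France?", results)
print(ranking_text)  # raw ranking text returned by the chat completions endpoint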
/llmadmin/backend/llm/pipelines/processors.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 |
3 | import torch
4 | from transformers import LogitsProcessor, StoppingCriteria
5 |
6 | from llmadmin.backend.logger import get_logger
7 |
8 | logger = get_logger(__name__)
9 |
10 |
11 | class StopOnTokens(StoppingCriteria):
12 | """
13 | Stopping criteria to allow stopping on multi-token sequences.
14 |
15 | ``first_stopping_token_in_batch`` attribute can be used for postprocessing after
16 | generation.
17 |
18 | Args:
19 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on.
20 | """
21 |
22 | def __init__(self, stopping_sequences: List[Union[List[int], int]]) -> None:
23 | self.stopping_sequences = stopping_sequences
24 | self.stop_ids = [
25 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id)
26 | for stop_id in self.stopping_sequences
27 | ]
28 | self.first_stopping_token_in_batch = {}
29 |
30 | def __call__(
31 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
32 | ) -> bool:
33 | for batch_index, batch in enumerate(input_ids):
34 | if batch_index not in self.first_stopping_token_in_batch:
35 | for stop_id in self.stop_ids:
36 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal(
37 | stop_id.to(batch.device)
38 | ):
39 | self.first_stopping_token_in_batch[batch_index] = len(batch) - 1
40 | break
41 | return len(self.first_stopping_token_in_batch) == len(input_ids)
42 |
43 |
44 | class StopOnTokensLogitsProcessor(LogitsProcessor):
45 | """
46 | Processor to force only EOS token after encountering a stopping sequence.
47 |
48 | Args:
49 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on.
50 | eos_token_id (Union[int, List[int]]): EOS token id(s).
51 | """
52 |
53 | def __init__(
54 | self,
55 | stopping_sequences: List[Union[List[int], int]],
56 | eos_token_id: Union[int, List[int]],
57 | ) -> None:
58 | if isinstance(eos_token_id, int):
59 | eos_token_id = [eos_token_id]
60 | self.eos_token_id = eos_token_id
61 | self.stop_ids = [
62 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id)
63 | for stop_id in stopping_sequences
64 | ]
65 | self._stopped_batches = set()
66 | self._nulled_batch = None
67 |
68 | def __call__(
69 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor
70 | ) -> torch.FloatTensor:
71 | for batch_index, batch in enumerate(input_ids):
72 | if batch_index not in self._stopped_batches:
73 | for stop_id in self.stop_ids:
74 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal(
75 | stop_id.to(batch.device)
76 | ):
77 | self._stopped_batches.add(batch_index)
78 | break
79 | if batch_index in self._stopped_batches:
80 | if self._nulled_batch is None:
81 | scores[batch_index, :] = -float("inf")
82 | scores[batch_index, self.eos_token_id] = 0
83 | self._nulled_batch = scores[batch_index]
84 | scores[batch_index] = self._nulled_batch
85 | return scores
86 |
--------------------------------------------------------------------------------
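A sketch of how these two classes are typically wired into transformers generation; the "gpt2" model and the "### End" stop string are illustrative choices, and the stopping-sequence helper comes from the pipelines utils module shown later in this tree.

# Hypothetical wiring of StopOnTokens / StopOnTokensLogitsProcessor into generate().
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessorList,
    StoppingCriteriaList,
)

from llmadmin.backend.llm.pipelines.processors import StopOnTokens, StopOnTokensLogitsProcessor
from llmadmin.backend.llm.pipelines.utils import tokenize_stopping_sequences_where_needed

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

stop_ids = tokenize_stopping_sequences_where_needed(tokenizer, ["### End"])
inputs = tokenizer("### Instruction:\nSay hello.\n### Response:\n", return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=32,
    stopping_criteria=StoppingCriteriaList([StopOnTokens(stop_ids)]),
    logits_processor=LogitsProcessorList(
        [StopOnTokensLogitsProcessor(stop_ids, eos_token_id=tokenizer.eos_token_id)]
    ),
)
print(tokenizer.decode(outputs[0]))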
/llmadmin/backend/llm/initializers/llamacpp.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
3 |
4 | import torch
5 | from huggingface_hub import hf_hub_download
6 |
7 | from llmadmin.backend.logger import get_logger
8 |
9 | from ._base import LLMInitializer
10 |
11 | if TYPE_CHECKING:
12 | from llama_cpp import Llama
13 |
14 | logger = get_logger(__name__)
15 |
16 |
17 | class LlamaCppTokenizer:
18 | """Thin wrapper around a llama_cpp model to provide a subset of the PreTrainedTokenizer interface"""
19 |
20 | def __init__(self, model: "Llama") -> None:
21 | self.model = model
22 |
23 | def decode(self, tokens: Union[List[int], List[List[int]]], **kwargs) -> Union[str, List[str]]:
24 | if not tokens:
25 | return tokens
26 | if isinstance(tokens[0], int):
27 | return self.model.detokenize(tokens).decode("utf-8")
28 | return [self.decode(t) for t in tokens]
29 |
30 | def encode(self, text: Union[str, List[str], List[List[str]]], **kwargs) -> Union[List[int], List[List[int]]]:
31 | if isinstance(text, str):
32 | return self.model.tokenize(text.encode("utf-8"))
33 | return [self.encode(t) for t in text]
34 |
35 | def batch_encode(self, text: Union[List[str], List[List[str]]], **kwargs) -> str:
36 | return self.encode(text)
37 |
38 | def __call__(self, text: Union[str, List[str], List[List[str]]], **kwargs):
39 | return self.encode(text, **kwargs)
40 |
41 |
42 | class LlamaCppInitializer(LLMInitializer):
43 | """Initialize llama_cpp model and tokenizer.
44 |
45 | Args:
46 | device (torch.device): Device to place model and tokenizer on.
47 | world_size (int): Number of GPUs to use.
48 | model_filename (str): Name of the model file to download from HuggingFace Hub.
49 | This needs to be in the ``model_id`` repository (passed to ``self.load()``).
50 | **model_init_kwargs: Keyword arguments to pass to the llama_cpp model init.
51 | """
52 |
53 | def __init__(
54 | self,
55 | device: torch.device,
56 | world_size: int,
57 | model_filename: str,
58 | **model_init_kwargs,
59 | ):
60 | super().__init__(
61 | device=device,
62 | world_size=world_size,
63 | )
64 | self.model_filename = model_filename
65 | self.model_init_kwargs = model_init_kwargs
66 |
67 | def _get_model_init_kwargs(self) -> Dict[str, Any]:
68 | return {
69 | # We use a large integer to put all of the layers on GPU by default.
70 | "n_gpu_layers": 0 if self.device.type == "cpu" else 10**6,
71 | "seed": 0,
72 | "verbose": False,
73 | "n_threads": int(os.environ["OMP_NUM_THREADS"]),
74 | **self.model_init_kwargs,
75 | }
76 |
77 | def load_model(self, model_id: str) -> "Llama":
78 | logger.info(f"LlamaCppInitializer downloading {model_id} : {self.model_filename}")
79 | model_path = hf_hub_download(model_id, self.model_filename)
80 | logger.info(f"LlamaCppInitializer Loading model {model_path}")
81 | # Lazy import to avoid issues on CPU head node
82 | from llama_cpp import Llama
83 |
84 | return Llama(
85 | model_path=os.path.abspath(model_path),
86 | **self._get_model_init_kwargs(),
87 | )
88 |
89 | def load_tokenizer(self, tokenizer_name: str) -> None:
90 | return None
91 |
92 | def postprocess(
93 | self, model: "Llama", tokenizer: None
94 | ) -> Tuple["Llama", LlamaCppTokenizer]:
95 | return super().postprocess(model, LlamaCppTokenizer(model))
--------------------------------------------------------------------------------
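A usage sketch for the llama.cpp initializer above; the Hub repository id and GGUF filename are placeholders, and OMP_NUM_THREADS must be set because _get_model_init_kwargs reads it.

# Hypothetical usage; the repo id and model filename are placeholders, not real artifacts.
import os
import torch

from llmadmin.backend.llm.initializers.llamacpp import LlamaCppInitializer

os.environ.setdefault("OMP_NUM_THREADS", "4")  # consumed by _get_model_init_kwargs

initializer = LlamaCppInitializer(
    device=torch.device("cpu"),
    world_size=1,
    model_filename="model.gguf",  # placeholder file name inside the Hub repo
    n_ctx=2048,                   # extra kwargs are forwarded to llama_cpp.Llama
)
model, tokenizer = initializer.load("some-org/some-llama-gguf")  # placeholder repo id
print(tokenizer.encode("hello"))  # LlamaCppTokenizer wraps the model's tokenize/detokenize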
/llmadmin/backend/llm/ft/tasks/_base.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from abc import abstractmethod
3 | from typing import Any
4 | from llmadmin.backend.server.models import DataConfig
5 | from datasets import load_dataset
6 | from datasets import load_metric
7 | import transformers
8 | from transformers import PreTrainedTokenizer, PreTrainedModel
9 | from typing import Any, Dict
10 | from llmadmin.backend.server.models import FTConfig
11 | from llmadmin.backend.logger import get_logger
12 |
13 | logger = get_logger(__name__)
14 |
15 | class Task(abc.ABC):
16 | AUTO_MODEL_CLASS: transformers.AutoModel = None
17 |
18 | # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
19 | # or a path to a custom `datasets` loading script.
20 | DATASET_PATH: str = None
21 |
22 | # The name of a subset within `DATASET_PATH`.
23 | DATASET_NAME: str = None
24 |
25 | # kwargs passed to transformers' "from_pretrained" when building the model
26 | FROM_PRETRAINED_KWARGS: Dict[str, Any] = None
27 |
28 | def __init__(
29 | self,
30 | tokenizer: "PreTrainedTokenizer",
31 | ft_config: "FTConfig",
32 | ) -> None:
33 | self.tokenizer = tokenizer
34 | self.ft_config = ft_config
35 | self.download_dataset()
36 | self._pre()
37 |
38 | @classmethod
39 | def from_tokenizer(
40 | cls,
41 | tokenizer: "PreTrainedTokenizer",
42 | ft_config: "FTConfig",
43 | ) -> "Task":
44 | fac = cls(
45 | tokenizer = tokenizer,
46 | ft_config = ft_config
47 | )
48 |
49 | return fac
50 |
51 | @abstractmethod
52 | def get_data_proprocess(self) -> Any:
53 | """Change trainning data to tensor model can accepted"""
54 | pass
55 |
56 | @abstractmethod
57 | def get_compute_metrics(self) -> Any:
58 | pass
59 |
60 | @abstractmethod
61 | def get_data_collator(self) -> Any:
62 | pass
63 |
64 | def _pre(self) -> Any:
65 | pass
66 |
67 | @abstractmethod
68 | def training_key(self):
69 | """
70 | :return: str
71 | The name of the dataset split to train on (e.g. "train").
72 | """
73 | pass
74 |
75 | @abstractmethod
76 | def validation_key(self):
77 | """
78 | :return: str
79 | The name of the dataset split to evaluate on (e.g. "validation" or "test").
80 | """
81 | pass
82 |
83 | @abstractmethod
84 | def getTrainDataSet(self):
85 | pass
86 |
87 | @abstractmethod
88 | def getEvalDataSet(self):
89 | pass
90 |
91 | @abstractmethod
92 | def getSmallTrainDataSet(self, len: int):
93 | pass
94 |
95 | @abstractmethod
96 | def getSmallEvalDataSet(self, len: int):
97 | pass
98 |
99 | def get_dataset(self):
100 | return self.dataset
101 |
102 | def download_dataset(self):
103 | # Downloading and loading a dataset from the hub.
104 | logger.info("Start loading dataset")
105 | if self.ft_config.data_config.local_path:
106 | logger.info(f"Loading dataset from local path {self.ft_config.data_config.local_path}")
107 | raw_datasets = load_dataset(self.ft_config.data_config.local_path)
108 | else:
109 | if self.DATASET_NAME:
110 | logger.info(f"Downloading dataset {self.DATASET_NAME} from {self.DATASET_PATH}")
111 | raw_datasets = load_dataset(self.DATASET_PATH, self.DATASET_NAME)
112 | else:
113 | logger.info(f"Downloading dataset from {self.DATASET_PATH}")
114 | raw_datasets = load_dataset(self.DATASET_PATH)
115 | logger.info("Done load dataset")
116 | logger.info(f"{raw_datasets}")
117 | self.dataset = raw_datasets
118 |
119 | def set_model(self, model: PreTrainedModel):
120 | self.model = model
121 |
122 | def get_model(self):
123 | return self.model
--------------------------------------------------------------------------------
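To make the abstract surface above concrete, here is a hypothetical minimal Task subclass; the GLUE/SST-2 dataset, its "sentence" column, and the class name are illustrative assumptions and not part of this repository (the real tasks live in the sibling modules of this package).

# Hypothetical minimal Task subclass; GLUE/SST-2 is an illustrative choice only.
from typing import Any

from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

from llmadmin.backend.llm.ft.tasks._base import Task


class ToySst2Task(Task):
    AUTO_MODEL_CLASS = AutoModelForSequenceClassification
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"
    FROM_PRETRAINED_KWARGS = {"num_labels": 2}

    def get_data_proprocess(self) -> Any:  # name kept as spelled in the base class
        tokenizer = self.tokenizer

        def preprocess(examples):
            return tokenizer(examples["sentence"], truncation=True)

        return preprocess

    def get_compute_metrics(self) -> Any:
        return None

    def get_data_collator(self) -> Any:
        return DataCollatorWithPadding(tokenizer=self.tokenizer)

    def training_key(self):
        return "train"

    def validation_key(self):
        return "validation"

    def getTrainDataSet(self):
        return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)

    def getEvalDataSet(self):
        return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)

    def getSmallTrainDataSet(self, len: int):  # parameter name mirrors the base class
        return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)

    def getSmallEvalDataSet(self, len: int):
        return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)


# Instances are built via Task.from_tokenizer(tokenizer, ft_config); constructing the
# FTConfig (normally parsed from a YAML app definition) is omitted from this sketch.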
/llm_finetune_ray.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import ray
5 | import ray.util.scheduling_strategies
6 |
7 |
8 | def force_on_node(node_id: str, remote_func_or_actor_class):
9 | scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
10 | node_id=node_id, soft=False
11 | )
12 | options = {"scheduling_strategy": scheduling_strategy}
13 | return remote_func_or_actor_class.options(**options)
14 |
15 |
16 | def run_on_every_node(remote_func_or_actor_class, *remote_args, **remote_kwargs):
17 | refs = []
18 | for node in ray.nodes():
19 | if node["Alive"] and node["Resources"].get("GPU", None):
20 | refs.append(
21 | force_on_node(node["NodeID"], remote_func_or_actor_class).remote(
22 | *remote_args, **remote_kwargs
23 | )
24 | )
25 | return ray.get(refs)
26 |
27 |
28 | @ray.remote(num_gpus=1)
29 | def mount_nvme():
30 | if os.path.exists("/nvme"):
31 | return
32 | subprocess.run(
33 | 'drive_name="${1:-/dev/nvme1n1}"; mount_path="${2:-/nvme}"; set -x; sudo file -s "$drive_name"; sudo apt install xfsprogs -y; sudo mkfs -t xfs "$drive_name"; sudo mkdir "$mount_path" && sudo mount "$drive_name" "$mount_path" && sudo chown -R ray "$mount_path"',
34 | shell=True,
35 | check=True,
36 | )
37 |
38 |
39 | @ray.remote(num_gpus=1)
40 | def download_model(base_model_name=None):
41 | base_model_name = (
42 | base_model_name or "RWKV-4-Pile-1B5"
43 | ) # "RWKV-4-Pile-1B5", "RWKV-4-Pile-430M", "RWKV-4-Pile-169M"
44 | base_model_url = f"https://huggingface.co/BlinkDL/{base_model_name.lower()}"
45 | subprocess.run(
46 | f"cd /nvme; git lfs clone {base_model_url}; ls '{base_model_name.lower()}'",
47 | shell=True,
48 | check=True,
49 | )
50 |
51 |
52 | @ray.remote(num_gpus=1)
53 | def download_pile_remote(dataset_name):
54 | subprocess.run(
55 | "rm -rf /nvme/enwik8; rm -rf /nvme/data/pile/; rm -rf ~/gpt-neox",
56 | shell=True,
57 | check=True,
58 | )
59 | subprocess.run(
60 | "cd ~/; git clone https://github.com/Yard1/gpt-neox.git;", shell=True
61 | )
62 | subprocess.run(
63 | f"cd ~/; cd gpt-neox; echo 'starting dataset download {dataset_name}'; python prepare_data.py {dataset_name} -d /nvme/data/pile -t HFTokenizer --vocab-file '/mnt/cluster_storage/20B_tokenizer.json' && echo 'download complete'",
64 | shell=True,
65 | check=True,
66 | )
67 |
68 |
69 | def download_pile(dataset_name):
70 | subprocess.run(
71 | # Necessary for gpt-neox tokenizer to work
72 | "pip uninstall -y deepspeed && pip install --user -U git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed",
73 | shell=True,
74 | )
75 | try:
76 | run_on_every_node(download_pile_remote, dataset_name=dataset_name)
77 | finally:
78 | subprocess.run(
79 | # Use latest deepspeed for actual training. Will crash otherwise
80 | "pip uninstall -y deepspeed && pip install -U --user deepspeed",
81 | shell=True,
82 | )
83 |
84 |
85 | @ray.remote(num_gpus=1)
86 | def clean_cache():
87 | subprocess.run("rm -rf ~/.cache/torch_extensions", shell=True, check=True)
88 |
89 |
90 | @ray.remote(num_gpus=1)
91 | def run(cmd: str):
92 | subprocess.run(cmd, shell=True, check=True)
93 |
94 |
95 | if __name__ == "__main__":
96 | parser = argparse.ArgumentParser()
97 |
98 | parser.add_argument("function", type=str, help="function in this file to run")
99 | parser.add_argument("args", nargs="*", type=str, help="string args to function")
100 | args = parser.parse_args()
101 |
102 | ray.init()
103 | if args.function not in globals():
104 | raise ValueError(f"{args.function} doesn't exist")
105 | fn = globals()[args.function]
106 | assert callable(fn) or hasattr(fn, "_function")
107 | print(f"Running {args.function}({', '.join(args.args)})")
108 | if hasattr(fn, "_function"):
109 | run_on_every_node(fn, *args.args)
110 | else:
111 | fn(*args.args)
112 |
--------------------------------------------------------------------------------
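The helpers above are normally driven through the CLI at the bottom of the script; below is a hedged sketch of equivalent programmatic use, assuming the script is importable from the working directory and the cluster has GPU nodes with /nvme mounted.

# Hypothetical programmatic use of the node-affinity helpers (GPU nodes required).
import ray

from llm_finetune_ray import clean_cache, download_model, run_on_every_node

ray.init()
run_on_every_node(clean_cache)                         # clear stale torch extensions on each GPU node
run_on_every_node(download_model, "RWKV-4-Pile-430M")  # clone the base model onto each node's /nvme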
/llmadmin/backend/server/run.py:
--------------------------------------------------------------------------------
1 | # import sys
2 | from typing import Dict, List, Union
3 | import ray
4 | from llmadmin.backend.server.app import ApiServer
5 | from llmadmin.backend.server.config import SERVE_RUN_HOST
6 | from llmadmin.backend.server.models import FTApp
7 | from llmadmin.backend.server.utils import parse_args, parse_args_ft
8 | # import uuid
9 | # import os
10 | from llmadmin.backend.llm.ft import TransformersFT
11 | from llmadmin.backend.llm.ft import RayTrain
12 | from llmadmin.backend.logger import get_logger
13 | from ray.serve._private.constants import DEFAULT_HTTP_PORT
14 | from llmadmin.backend.server.utils import get_serve_port
15 | from ray import serve
16 |
17 | # ray.init(address="auto")
18 | logger = get_logger(__name__)
19 |
20 | def run_ray_ft(ft: Union[FTApp, str]):
21 | """Run the LLM Train on the local Ray Cluster
22 |
23 | Args:
24 | model: A LLMApp objects or paths to yaml files defining LLMApps
25 |
26 | Example:
27 | run("models/model.yaml") # run one model in the model directory
28 | run(FTApp) # run a single LLMApp
29 | """
30 |
31 | ft = parse_args_ft(ft)
32 | if not ft:
33 | raise RuntimeError("No valiabled fine tune defination were found.")
34 |
35 | if isinstance(ft, FTApp):
36 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}")
37 | else:
38 | raise RuntimeError("Not a Finetune App were found.")
39 |
40 | # ray._private.usage.usage_lib.record_library_usage("llmadmin")
41 |
42 | runner = RayTrain(ft)
43 | runner.train()
44 |
45 | def run_ft(ft: Union[FTApp, str]):
46 | """Run the LLM Server on the local Ray Cluster
47 |
48 | Args:
49 | model: A LLMApp objects or paths to yaml files defining LLMApps
50 |
51 | Example:
52 | run("models/model.yaml") # run one model in the model directory
53 | run(FTApp) # run a single LLMApp
54 | """
55 |
56 | ft = parse_args_ft(ft)
57 | if not ft:
58 | raise RuntimeError("No valiabled fine tune defination were found.")
59 |
60 | if isinstance(ft, FTApp):
61 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}")
62 | else:
63 | raise RuntimeError("Not a Finetune App were found.")
64 |
65 | ray._private.usage.usage_lib.record_library_usage("llmadmin")
66 |
67 | runner = TransformersFT(ft)
68 | runner.train()
69 |
70 | def start_apiserver(port: int = DEFAULT_HTTP_PORT, resource_config: str = None, scale_config: str = None):
71 | """Run the API Server on the local Ray Cluster
72 |
73 | Args:
74 | port: The HTTP port to serve on (defaults to Ray Serve's default port).
75 | resource_config: Ray actor resources as a "k=v" string, e.g. "num_cpus=0.5".
76 | scale_config: Autoscaling options as a "k=v" string, e.g. "min_replicas=1".
77 | """
78 | scale_dict = dict()
79 | try:
80 | scale_dict = toDict(scale_config)
81 | except Exception as e:
82 | raise ValueError(f"Invalid value of scale config '{scale_config}'") from e
83 | resource_dict = None
84 | try:
85 | resource_dict = toDict(resource_config)
86 | except Exception as e:
87 | raise ValueError(f"Invalid value of resource config '{resource_config}'") from e
88 |
89 | # ray._private.usage.usage_lib.record_library_usage("llmfinetune")
90 | # ray.init(address="auto")
91 | serve_start_port = get_serve_start_port(port)
92 | app = ApiServer.options(autoscaling_config=scale_dict, ray_actor_options=resource_dict).bind()
93 | serve.start(http_options={"host": SERVE_RUN_HOST, "port": serve_start_port})
94 | logger.info(f"Serve 'apiserver' is running at {SERVE_RUN_HOST}/{serve_start_port}")
95 | logger.info(f"Serve 'apiserver' run with resource: {resource_dict} , scale: {scale_dict}")
96 | serve.run(app, name="apiserver", route_prefix="/api")
97 |
98 | # Parse a "k1=v1,k2=v2" (or space-separated "k1=v1 k2=v2") string into a dict by eval-ing dict(...)
99 | def toDict(kv: str) -> Dict:
100 | if kv:
101 | s = kv.replace(' ', ', ')
102 | return eval(f"dict({s})")
103 | else:
104 | return dict()
105 |
106 | def get_serve_start_port(port: int):
107 | serve_start_port = port
108 | serve_runtime_port = get_serve_port()
109 | if serve_runtime_port > -1:
110 | logger.info(
111 | f"Serve is already running at {SERVE_RUN_HOST}:{serve_runtime_port}")
112 | serve_start_port = serve_runtime_port
113 | return serve_start_port
114 |
115 | # if __name__ == "__main__":
116 | # run_ft(*sys.argv[1:])
117 |
118 |
119 |
--------------------------------------------------------------------------------
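A sketch of the "k=v" string format that toDict expects (values are evaluated as Python literals) and of starting the API server with it; the concrete numbers below are only examples.

# Hypothetical usage of toDict and start_apiserver from the module above.
from llmadmin.backend.server.run import start_apiserver, toDict

print(toDict("num_cpus=0.5,num_gpus=0"))        # {'num_cpus': 0.5, 'num_gpus': 0}
print(toDict("min_replicas=1 max_replicas=2"))  # spaces are rewritten to ', ' before eval()

# Deploy the Ray Serve 'apiserver' app with those settings (requires a running Ray cluster).
start_apiserver(resource_config="num_cpus=0.5", scale_config="min_replicas=1,max_replicas=2")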
/.gitignore:
--------------------------------------------------------------------------------
1 | # The build output should clearly not be checked in
2 | .llm-ray/
3 | *test-output.xml
4 | /bazel-*
5 | /python/ray/core
6 | /python/ray/pickle5_files/
7 | /python/ray/thirdparty_files/
8 | /python/ray/pyarrow_files/
9 | /python/ray/jars/
10 | /python/ray/cpp/
11 | /python/build
12 | /python/dist
13 | /python/python-driver-*
14 | /python/ray/serve/generated
15 | /thirdparty/pkg/
16 | /build/java
17 | .jar
18 | /dashboard/client/build
19 | finetune_models
20 |
21 | # Files generated by flatc should be ignored
22 | /src/ray/gcs/format/*_generated.h
23 | /src/ray/object_manager/format/*_generated.h
24 | /src/ray/raylet/format/*_generated.h
25 | /java/runtime/src/main/java/io/ray/runtime/generated/*
26 | /java/serve/src/main/java/io/ray/serve/generated/*
27 |
28 | # Files generated by the C++ worker should be ignored.
29 | /cpp/example/thirdparty/
30 | /cpp/example/bazel-*
31 | /python/ray/cpp
32 |
33 | # Redis temporary files
34 | *dump.rdb
35 |
36 | # Python byte code files
37 | *.pyc
38 | python/.eggs
39 |
40 | # Backup files
41 | *.bak
42 |
43 | # Emacs temporary files
44 | *~
45 | *#
46 |
47 | # Compiled Object files
48 | *.slo
49 | *.lo
50 | *.o
51 | *.xo
52 | *.obj
53 |
54 | # Precompiled Headers
55 | *.gch
56 | *.pch
57 |
58 | # Compiled Dynamic libraries
59 | *.so
60 | *.dylib
61 | *.dll
62 | python/ray/_raylet.pyd
63 |
64 | # Incremental linking files
65 | *.ilk
66 |
67 | # Library export files
68 | *.exp
69 |
70 | # Debug symbols
71 | *.pdb
72 |
73 | # Fortran module files
74 | *.mod
75 | !deploy/ray-operator/go.mod
76 |
77 | # Compiled Static libraries
78 | *.lai
79 | *.la
80 | *.a
81 | *.lib
82 |
83 | # Executables
84 | *.exe
85 | *.out
86 | *.app
87 |
88 | # Visual Studio files
89 | /packages
90 | *.suo
91 | *.user
92 | *.VC.db
93 | *.VC.opendb
94 |
95 | # Protobuf-generated files
96 | *_pb2.py
97 | *.pb.h
98 | *.pb.cc
99 |
100 | # Ray cluster configuration
101 | scripts/nodes.txt
102 |
103 | # OS X folder attributes
104 | .DS_Store
105 |
106 | # Debug files
107 | *.dSYM/
108 | *.su
109 |
110 | # Python setup files
111 | *.egg-info
112 |
113 | # Compressed files
114 | *.gz
115 |
116 | # Datasets from examples
117 | **/MNIST_data/
118 | **/cifar-10-batches-bin/
119 |
120 | # Generated documentation files
121 | /doc/_build
122 | /doc/source/_static/thumbs
123 | /doc/source/tune/generated_guides/
124 | /doc/source/**/doc/
125 |
126 | # User-specific stuff:
127 | .idea/**/workspace.xml
128 | .idea/**/tasks.xml
129 | .idea/dictionaries
130 | .llvm-local.bazelrc
131 |
132 | # Sensitive or high-churn files:
133 | .idea/**/dataSources/
134 | .idea/**/dataSources.ids
135 | .idea/**/dataSources.xml
136 | .idea/**/dataSources.local.xml
137 | .idea/**/sqlDataSources.xml
138 | .idea/**/dynamic.xml
139 | .idea/**/uiDesigner.xml
140 |
141 | # Gradle:
142 | .idea/**/gradle.xml
143 | .idea/**/libraries
144 | .idea
145 |
146 | # Website
147 | /site/Gemfile.lock
148 | /site/.sass-cache
149 | /site/_site
150 |
151 | # Pytest Cache
152 | **/.pytest_cache
153 | **/.cache
154 | .benchmarks
155 | python-driver-*
156 |
157 | # Vscode
158 | .vscode/
159 |
160 | *.iml
161 |
162 | # Java
163 | java/**/target
164 | java/**/lib
165 | java/**/.settings
166 | java/**/.classpath
167 | java/**/.project
168 | java/runtime/native_dependencies/
169 | java/testng_custom.xml
170 |
171 | dependency-reduced-pom.xml
172 |
173 | # Cpp
174 | cpp/example/thirdparty/
175 |
176 | .clwb
177 |
178 | # pom.xml files generated from pom_template.xml
179 | java/**/pom.xml
180 |
181 | # python virtual env
182 | venv
183 |
184 | # pyenv version file
185 | .python-version
186 |
187 | # Vim
188 | .*.swp
189 | *.swp
190 | .*.swo
191 | *.swo
192 | tags
193 | tags.lock
194 | tags.temp
195 | *.vim
196 |
197 | # Emacs
198 | .#*
199 |
200 | # tools
201 | tools/prometheus*
202 |
203 | # ray project files
204 | project-id
205 | .mypy_cache/
206 |
207 | # release test related
208 | .anyscale.yaml
209 | test_state.json
210 |
211 | # workflow storage
212 | workflow_data/
213 |
214 | # vscode java extention generated
215 | .factorypath
216 |
217 | # Jupyter Notebooks
218 | **/.ipynb_checkpoints/
219 |
220 | /external
221 | # Compiled output -> don't check in
222 | /compile_commands.json
223 | # Directory where clangd puts its indexing work
224 | /.cache/
225 |
226 | # Auto-generated tag mapping
227 | tag-mapping.json
228 |
229 | .bazeliskrc
230 |
231 | # ignore tmp files
232 | *.tmp
233 | deploy/anyscale/service.yaml
234 | out
235 |
236 | # build output
237 | build/
238 | dist/
239 |
240 | results/
241 | aviary-output.json
242 | evaluation-output.json
243 | prompts.txt
244 | hash
245 | __pycache__
246 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union, Tuple
2 |
3 | import torch
4 | from transformers import PreTrainedTokenizer
5 |
6 | from llmadmin.backend.server.models import Prompt
7 |
8 |
9 | def tokenize_string(tokenizer: PreTrainedTokenizer, key: str) -> Union[int, List[int]]:
10 | """Tokenize a string using a tokenizer.
11 |
12 | Args:
13 | tokenizer (PreTrainedTokenizer): Tokenizer to use.
14 | key (str): String to tokenize.
15 | """
16 | token_ids = tokenizer.encode(key, add_special_tokens=False)
17 | return token_ids[0] if len(token_ids) == 1 else token_ids
18 |
19 |
20 | def decode_tokens(tokenizer: PreTrainedTokenizer, tokens: Union[int, List[int]]) -> str:
21 | tokens = tokens if isinstance(tokens, list) else [tokens]
22 | text = tokenizer.decode(tokens)
23 | return text
24 |
25 |
26 | def truncate_to_first_stop_token(
27 | tokens: torch.LongTensor,
28 | stop_ids: List[Union[int, List[int]]],
29 | ) -> torch.LongTensor:
30 | """Truncate tokens up to the first stop_id.
31 |
32 | Args:
33 | tokens (torch.LongTensor): Tokens to truncate.
34 | stop_ids (List[Union[int, List[int]]]): Stop ids to truncate at. Can be
35 | composed of single stop ids or sequences of ids.
36 | """
37 | if not stop_ids:
38 | return tokens
39 | stop_ids: List[torch.LongTensor] = [
40 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id)
41 | for stop_id in stop_ids
42 | ]
43 | for i in range(len(tokens)):
44 | for stop_id_index, _ in enumerate(stop_ids):
45 | stop_id = stop_ids[stop_id_index].to(tokens.device)
46 | if len(tokens) - i >= len(stop_id) and tokens[i : len(stop_id) + i].equal(
47 | stop_id
48 | ):
49 | return tokens[:i]
50 | return tokens
51 |
52 |
53 |
54 | def _construct_prompt(prompt: Union[str, Prompt], prompt_format: str) -> str:
55 | if isinstance(prompt, Prompt):
56 | if prompt.use_prompt_format and prompt_format:
57 | return prompt_format.format(instruction=prompt.prompt)
58 | else:
59 | return prompt.prompt
60 | return prompt_format.format(instruction=prompt) if prompt_format else prompt
61 |
62 | def construct_prompts(
63 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]],
64 | prompt_format: str,
65 | ) -> List[str]:
66 | """Construct prompts from a prompt string or list of prompts."""
67 | if not isinstance(prompts, list):
68 | prompts = [prompts]
69 | return [_construct_prompt(prompt, prompt_format) for prompt in prompts]
70 |
71 | def construct_prompts_experimental(
72 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]],
73 | prompt_format: str,
74 | ) -> List[str]:
75 | """Construct prompts from a prompt string or list of prompts."""
76 | if not isinstance(prompts, list):
77 | prompts = [prompts]
78 |
79 | params = []
80 | for prompt in prompts:
81 | if isinstance(prompt, Prompt) and isinstance(prompt.prompt, Tuple):
82 | params += [_construct_prompt(prompt, prompt_format) for prompt in prompt.prompt]
83 | else:
84 | params.append(_construct_prompt(prompt, prompt_format))
85 | return params
86 |
87 |
88 | def tokenize_stopping_sequences_where_needed(
89 | tokenizer: PreTrainedTokenizer,
90 | stopping_sequences: List[Union[str, int, List[int]]],
91 | ) -> List[Union[List[int], int]]:
92 | """If any sequence is a string, tokenize it.
93 |
94 | Args:
95 | tokenizer (PreTrainedTokenizer): Tokenizer to use.
96 | stopping_sequences (List[Union[str, int, List[int]]]): Stopping sequences to
97 | tokenize. Can be ids, sequences of ids or strings.
98 | """
99 | if not stopping_sequences:
100 | return None
101 | return [
102 | tokenize_string(tokenizer, sequence) if isinstance(sequence, str) else sequence
103 | for sequence in stopping_sequences
104 | ]
105 |
106 |
107 | def decode_stopping_sequences_where_needed(
108 | tokenizer: PreTrainedTokenizer,
109 | stopping_sequences: List[Union[str, int, List[int]]],
110 | ) -> List[str]:
111 | """If any sequence is a string, tokenize it."""
112 | if not stopping_sequences:
113 | return None
114 | return [
115 | decode_tokens(tokenizer, sequence)
116 | if not isinstance(sequence, str)
117 | else sequence
118 | for sequence in stopping_sequences
119 | ]
120 |
--------------------------------------------------------------------------------
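A small round trip through the prompt and stopping-sequence helpers above; the "gpt2" tokenizer and the prompt format string are illustrative only.

# Hypothetical round trip through construct_prompts and the stopping-sequence helpers.
from transformers import AutoTokenizer

from llmadmin.backend.llm.pipelines.utils import (
    construct_prompts,
    decode_stopping_sequences_where_needed,
    tokenize_stopping_sequences_where_needed,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
fmt = "### Instruction:\n{instruction}\n### Response:\n"

print(construct_prompts("Summarize Ray Serve in one sentence.", prompt_format=fmt))

stop_ids = tokenize_stopping_sequences_where_needed(tokenizer, ["### End", 50256])
print(stop_ids)  # mixed list: token-id sequences for strings, raw ids passed through
print(decode_stopping_sequences_where_needed(tokenizer, stop_ids))  # back to text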
/llmadmin/backend/llm/ft/tasks/maskedlm_imdb.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModelForMaskedLM
3 | from typing import Any
4 | import pandas as pd
5 | import numpy as np
6 |
7 |
8 |
9 | class MaskedLMImdb(Task):
10 | AUTO_MODEL_CLASS = AutoModelForMaskedLM
11 |
12 | DATASET_PATH = "imdb"
13 |
14 | def get_data_proprocess(self) -> Any:
15 | tokenizer = self.tokenizer
16 |
17 | def group_texts(examples):
18 | # Concatenate all texts
19 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
20 | # Compute length of concatenated texts
21 | total_length = len(concatenated_examples[list(examples.keys())[0]])
22 | # We drop the last chunk if it's smaller than chunk_size
23 | total_length = (total_length // chunk_size) * chunk_size
24 | # Split by chunks of max_len
25 | result = {
26 | k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
27 | for k, t in concatenated_examples.items()
28 | }
29 | # Create a new labels column
30 | result["labels"] = result["input_ids"].copy()
31 | return result
32 |
33 |
34 | chunk_size = 128
35 | # TODO: refactor into a reusable decorator
36 | def preprocess_function(examples: pd.DataFrame):
37 | # examples = examples.to_dict("list")
38 | result = tokenizer(examples["text"])
39 | if tokenizer.is_fast:
40 | result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
41 |
42 | tokenized_inputs = group_texts(result)
43 |
44 | # Add back the original columns
45 | ret = {**tokenized_inputs}
46 | return pd.DataFrame.from_dict(ret)
47 |
48 | return preprocess_function
49 |
50 | def get_data_collator(self) -> Any:
51 | import collections
52 | import numpy as np
53 | from transformers import default_data_collator
54 |
55 | wwm_probability = 0.2
56 | tokenizer = self.tokenizer
57 | def whole_word_masking_data_collator(features):
58 | for feature in features:
59 | word_ids = feature.pop("word_ids")
60 |
61 | # Create a map between words and corresponding token indices
62 | mapping = collections.defaultdict(list)
63 | current_word_index = -1
64 | current_word = None
65 | for idx, word_id in enumerate(word_ids):
66 | if word_id is not None:
67 | if word_id != current_word:
68 | current_word = word_id
69 | current_word_index += 1
70 | mapping[current_word_index].append(idx)
71 |
72 | # Randomly mask words
73 | mask = np.random.binomial(1, wwm_probability, (len(mapping),))
74 | input_ids = feature["input_ids"]
75 | labels = feature["labels"]
76 | new_labels = [-100] * len(labels)
77 | for word_id in np.where(mask)[0]:
78 | word_id = word_id.item()
79 | for idx in mapping[word_id]:
80 | new_labels[idx] = labels[idx]
81 | input_ids[idx] = tokenizer.mask_token_id
82 | feature["labels"] = new_labels
83 |
84 | return default_data_collator(features)
85 |
86 | return whole_word_masking_data_collator
87 |
88 | def get_compute_metrics(self) -> Any:
89 | return None
90 |
91 | def training_key(self):
92 | """
93 | :return: str
94 | The name of the dataset split used for training.
95 | """
96 | return "train"
97 |
98 | def validation_key(self):
99 | """
100 | :return: str
101 | The name of the dataset split used for evaluation.
102 | """
103 | return "test"
104 |
105 | def getTrainDataSet(self):
106 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)
107 |
108 | def getEvalDataSet(self):
109 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)
110 |
111 | def getSmallTrainDataSet(self, len: int):
112 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
113 |
114 | def getSmallEvalDataSet(self, len: int):
115 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
--------------------------------------------------------------------------------
/llmadmin/frontend/leaderboard.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pymongo import DESCENDING, MongoClient
3 |
4 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME, G5_COST_PER_S_IN_DOLLARS
5 |
6 |
7 | class Leaderboard:
8 | def __init__(self, url: str, project_name: str):
9 | self.url = url
10 | self.client = MongoClient(url)
11 | self.db = self.client[DB_NAME]
12 | self.coll = self.db[COLLECTION_NAME]
13 | self.project_name = project_name
14 |
15 | def generate_votes_leaderboard(self) -> pd.DataFrame:
16 | pipeline_votes = [
17 | {"$match": {"votes": {"$ne": None}}},
18 | {
19 | "$group": {
20 | "_id": {"llm": "$votes.llm"},
21 | "Votes": {"$sum": "$votes.score"},
22 | }
23 | },
24 | {"$sort": {"count": DESCENDING}},
25 | {
26 | "$project": {
27 | "LLM": "$_id.llm",
28 | "_id": 0,
29 | "Votes": 1,
30 | }
31 | },
32 | ]
33 |
34 | pipeline_contentions = [
35 | {"$match": {"votes": {"$ne": None}}},
36 | {"$unwind": {"path": "$responses"}},
37 | {
38 | "$group": {
39 | "_id": {"llm": "$responses.model_id"},
40 | "In Contention": {"$sum": 1.0},
41 | }
42 | },
43 | {
44 | "$project": {
45 | "LLM": "$_id.llm",
46 | "_id": 0,
47 | "In Contention": 1,
48 | }
49 | },
50 | ]
51 |
52 | df_contentions = pd.DataFrame(
53 | list(self.coll.aggregate(pipeline_contentions)),
54 | columns=["LLM", "In Contention"],
55 | )
56 | df_votes = pd.DataFrame(
57 | list(self.coll.aggregate(pipeline_votes)), columns=["LLM", "Votes"]
58 | )
59 | df = pd.merge(df_votes, df_contentions, on="LLM", how="right").fillna(0)
60 | # Use m-estimate correction with prior of 1/3
61 | df["Win Ratio"] = (df["Votes"] + 1) / (df["In Contention"] + 3) * 3 * 1000
62 | df["Win Ratio"] = df["Win Ratio"].astype(int)
63 | df = df.sort_values(by="Win Ratio", ascending=False)
64 | return df
65 |
66 | def generate_perf_leaderboard(self) -> pd.DataFrame:
67 | pipeline = [
68 | {"$match": {"votes": {"$ne": None}}},
69 | {"$unwind": {"path": "$responses"}},
70 | {"$match": {"responses": {"$ne": None}}},
71 | {
72 | "$group": {
73 | "_id": {"llm": "$responses.model_id"},
74 | "avg_latency": {"$avg": "$responses.gen_stats.total_time"},
75 | "avg_length": {"$avg": "$responses.gen_stats.num_total_tokens"},
76 | }
77 | },
78 | {
79 | "$project": {
80 | "LLM": "$_id.llm",
81 | "_id": 0,
82 | "Lat (s)": "$avg_latency",
83 | "Tokens (i/o)": "$avg_length",
84 | }
85 | },
86 | ]
87 |
88 | df = pd.DataFrame(
89 | list(self.coll.aggregate(pipeline)),
90 | columns=["LLM", "Lat (s)", "Tokens (i/o)"],
91 | )
92 | print(f"Raw DF \n{df}")
93 | df["Tokens/s"] = df["Tokens (i/o)"] / df["Lat (s)"]
94 | df["Cost per answer"] = df["Lat (s)"] * G5_COST_PER_S_IN_DOLLARS
95 | df["CP 1k tokens $"] = 1000 / df["Tokens/s"] * G5_COST_PER_S_IN_DOLLARS
96 | df = df.sort_values(by="Tokens/s", ascending=False)
97 | df = df.round(
98 | {
99 | "Lat (s)": 1,
100 | "Tokens (i/o)": 1,
101 | "Tokens/s": 1,
102 | "Cost per answer": 4,
103 | "CP 1k tokens $": 4,
104 | }
105 | )
106 | print(df)
107 | return df
108 |
109 |
110 | class DummyLeaderboard(Leaderboard):
111 | def __init__(self, url: str = None, project_name: str = None):
112 | pass
113 |
114 | def generate_votes_leaderboard(self) -> pd.DataFrame:
115 | return pd.DataFrame(
116 | columns=["LLM", "In Contention", "Win Ratio"],
117 | )
118 |
119 | def generate_perf_leaderboard(self) -> pd.DataFrame:
120 | return pd.DataFrame(
121 | columns=[
122 | "LLM",
123 | "Lat (s)",
124 | "Tokens (i/o)",
125 | "Tokens/s",
126 | "Cost per answer",
127 | "CP 1k tokens $",
128 | ]
129 | )
130 |
--------------------------------------------------------------------------------
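The "Win Ratio" column above applies an m-estimate with a prior of 1/3 before scaling by 1000; here is a tiny worked example with made-up counts.

# Worked example of the win-ratio smoothing used in generate_votes_leaderboard (counts invented).
votes, in_contention = 7, 20
win_ratio = int((votes + 1) / (in_contention + 3) * 3 * 1000)
print(win_ratio)  # 1043; a model with no observations scores exactly 1000 under the prior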
/llmadmin/api/sdk.py:
--------------------------------------------------------------------------------
1 | # from typing import Any, Dict, List
2 | from llmadmin.api.env import assert_has_backend
3 | from ray.serve._private.constants import DEFAULT_HTTP_PORT
4 | from llmadmin.backend.server import run
5 |
6 |
7 | # __all__ = ["models", "metadata", "run"]
8 |
9 | def start_apiserver(port: int = DEFAULT_HTTP_PORT, resource_config: str = None, scale_config: str = None) -> None:
10 | """Run Api server on the local ray cluster
11 |
12 | NOTE: This only works if you are running this command
13 | on the Ray or Anyscale cluster directly. It does not
14 | work from a general machine which only has the url and token
15 | for a model.
16 | """
17 | assert_has_backend()
18 | run.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config)
19 |
20 | def run_ft(ft: str) -> None:
21 | """Run LLMAdmin on the local ray cluster
22 |
23 | NOTE: This only works if you are running this command
24 | on the Ray or Anyscale cluster directly. It does not
25 | work from a general machine which only has the url and token
26 | for a model.
27 | """
28 | assert_has_backend()
29 | run.run_ft(ft)
30 |
31 | def run_ray_ft(ft: str) -> None:
32 | """Run LLMAdmin on the local ray cluster
33 |
34 | NOTE: This only works if you are running this command
35 | on the Ray or Anyscale cluster directly. It does not
36 | work from a general machine which only has the url and token
37 | for a model.
38 | """
39 | assert_has_backend()
40 | run.run_ray_ft(ft)
41 |
42 | # def models() -> List[str]:
43 | # """List available models"""
44 | # from llmadmin.common.backend import get_llmadmin_backend
45 |
46 | # backend = get_llmadmin_backend()
47 | # return backend.models()
48 |
49 | # def _is_llmadmin_model(model: str) -> bool:
50 | # """
51 | # Determine if this is an llmadmin model. LLMAdmin
52 | # models do not have a '://' in them.
53 | # """
54 | # return "://" not in model
55 |
56 | # def _supports_batching(model: str) -> bool:
57 | # provider, _ = model.split("://", 1)
58 | # return provider != "openai"
59 |
60 | # def _convert_to_llmadmin_format(model: str, llm_result):
61 | # generation = llm_result.generations
62 | # result_list = [{"generated_text": x.text} for x in generation[0]]
63 | # return result_list
64 |
65 | # def metadata(model_id: str) -> Dict[str, Dict[str, Any]]:
66 | # """Get model metadata"""
67 | # from llmadmin.common.backend import get_llmadmin_backend
68 |
69 | # backend = get_llmadmin_backend()
70 | # return backend.metadata(model_id)
71 |
72 | # def run(*model: str) -> None:
73 | # """Run LLMAdmin on the local ray cluster
74 |
75 | # NOTE: This only works if you are running this command
76 | # on the Ray or Anyscale cluster directly. It does not
77 | # work from a general machine which only has the url and token
78 | # for a model.
79 | # """
80 | # assert_has_backend()
81 | # from llmadmin.backend.server.run import run
82 | # run(*model)
83 |
84 | # def run_experimental(*model: str) -> None:
85 | # """Run LLMAdmin on the local ray cluster
86 |
87 | # NOTE: This only works if you are running this command
88 | # on the Ray or Anyscale cluster directly. It does not
89 | # work from a general machine which only has the url and token
90 | # for a model.
91 | # """
92 | # assert_has_backend()
93 | # from llmadmin.backend.server.run import run_experimental
94 |
95 | # run_experimental(*model)
96 |
97 | # def del_experimental(app_name: str) -> None:
98 | # """Delete ray serve on the local ray cluster
99 |
100 | # NOTE: This only works if you are running this command
101 | # on the Ray or Anyscale cluster directly. It does not
102 | # work from a general machine which only has the url and token
103 | # for a model.
104 | # """
105 | # assert_has_backend()
106 | # from llmadmin.backend.server.run import del_experimental
107 |
108 | # del_experimental(app_name)
109 |
110 | # def run_application(flow: dict) -> None:
111 | # """Run LLMAdmin on the local ray cluster
112 |
113 | # NOTE: This only works if you are running this command
114 | # on the Ray or Anyscale cluster directly. It does not
115 | # work from a general machine which only has the url and token
116 | # for a model.
117 | # """
118 | # assert_has_backend()
119 | # from llmadmin.backend.server.run import run_application
120 |
121 | # run_application(flow)
122 |
123 |
124 | # def run_comparation() -> None:
125 | # """Run LLMAdmin on the local ray cluster
126 |
127 | # NOTE: This only works if you are running this command
128 | # on the Ray or Anyscale cluster directly. It does not
129 | # work from a general machine which only has the url and token
130 | # for a model.
131 | # """
132 | # assert_has_backend()
133 | # from llmadmin.backend.server.run import run_comparation
134 |
135 | # run_comparation()
136 |
137 |
138 |
--------------------------------------------------------------------------------
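A sketch of calling the public SDK wrappers above from a node inside the Ray cluster; the YAML path is a placeholder for one of the fine-tune app definitions shipped under models/.

# Hypothetical SDK usage on a machine that is part of the Ray/Anyscale cluster.
from llmadmin.api.sdk import run_ft, start_apiserver

run_ft("models/ft--your-task--your-model.yaml")  # placeholder path to an FTApp YAML

# Alternatively, expose the API server with a small autoscaling configuration.
start_apiserver(port=8000, scale_config="min_replicas=1,max_replicas=2")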
/llmadmin/backend/llm/ft/tasks/tokenclassification_conll2003.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModelForTokenClassification
3 | from typing import Any
4 | import pandas as pd
5 | import evaluate
6 | import numpy as np
7 | from transformers import DataCollatorForTokenClassification
8 |
9 | class TokenclassificationConll2003(Task):
10 | AUTO_MODEL_CLASS = AutoModelForTokenClassification
11 |
12 | DATASET_PATH = "conll2003"
13 | FROM_PRETRAINED_KWARGS = {
14 | "num_labels": 9
15 | }
16 |
17 | def _pre(self) -> Any:
18 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names
19 | id2label = {i: label for i, label in enumerate(label_names)}
20 | label2id = {v: k for k, v in id2label.items()}
21 | self.FROM_PRETRAINED_KWARGS["id2label"] = id2label
22 | self.FROM_PRETRAINED_KWARGS["label2id"] = label2id
23 |
24 | def get_data_proprocess(self) -> Any:
25 | tokenizer = self.tokenizer
26 | def align_labels_with_tokens(labels, word_ids):
27 | new_labels = []
28 | current_word = None
29 | for word_id in word_ids:
30 | if word_id != current_word:
31 | # Start of a new word!
32 | current_word = word_id
33 | label = -100 if word_id is None else labels[word_id]
34 | new_labels.append(label)
35 | elif word_id is None:
36 | # Special token
37 | new_labels.append(-100)
38 | else:
39 | # Same word as previous token
40 | label = labels[word_id]
41 | # If the label is B-XXX we change it to I-XXX
42 | if label % 2 == 1:
43 | label += 1
44 | new_labels.append(label)
45 |
46 | return new_labels
47 |
48 | # TODO: refactor into a reusable decorator
49 | def preprocess_function(examples: pd.DataFrame):
50 | # examples = examples.to_dict("list")
51 | # inputs = [i.tolist() for i in examples["tokens"]]
52 | inputs = [i for i in examples["tokens"]]
53 | tokenized_inputs = tokenizer(
54 | inputs, truncation=True, is_split_into_words=True
55 | )
56 | all_labels = examples["ner_tags"]
57 | new_labels = []
58 | for i, labels in enumerate(all_labels):
59 | word_ids = tokenized_inputs.word_ids(i)
60 | new_labels.append(align_labels_with_tokens(labels, word_ids))
61 |
62 | tokenized_inputs["labels"] = new_labels
63 |
64 | # Add back the original columns
65 | ret = {**examples, **tokenized_inputs}
66 | return pd.DataFrame.from_dict(ret)
67 |
68 | return preprocess_function
69 |
70 | def get_data_collator(self) -> Any:
71 | data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer)
72 | return data_collator
73 |
74 | def get_compute_metrics(self) -> Any:
75 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names
76 | metric = evaluate.load("seqeval")
77 |
78 | def compute_metrics(eval_preds):
79 | logits, labels = eval_preds
80 | predictions = np.argmax(logits, axis=-1)
81 |
82 | # Remove ignored index (special tokens) and convert to labels
83 | true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
84 | true_predictions = [
85 | [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
86 | for prediction, label in zip(predictions, labels)
87 | ]
88 | all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
89 | return {
90 | "precision": all_metrics["overall_precision"],
91 | "recall": all_metrics["overall_recall"],
92 | "f1": all_metrics["overall_f1"],
93 | "accuracy": all_metrics["overall_accuracy"],
94 | }
95 |
96 | return compute_metrics
97 |
98 | def training_key(self):
99 | """
100 | :return: str
101 | The name of the dataset split used for training.
102 | """
103 | return "train"
104 |
105 | def validation_key(self):
106 | """
107 | :return: str
108 | The name of the dataset split used for validation.
109 | """
110 | return "validation"
111 |
112 | def getTrainDataSet(self):
113 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)
114 |
115 | def getEvalDataSet(self):
116 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)
117 |
118 | def getSmallTrainDataSet(self, len: int):
119 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
120 |
121 | def getSmallEvalDataSet(self, len: int):
122 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
123 |
--------------------------------------------------------------------------------
/llmadmin/backend/server/config.py:
--------------------------------------------------------------------------------
1 | from llmadmin.backend.server.models import LLMApp
2 |
3 |
4 | LLMTEMPLATE_DEPLOYMENT_CONFIG = {
5 | "autoscaling_config":{
6 | "min_replicas": 0,
7 | "initial_replicas": 1,
8 | "max_replicas": 8,
9 | "target_num_ongoing_requests_per_replica": 1.0,
10 | "metrics_interval_s": 10.0,
11 | "look_back_period_s": 30.0,
12 | "smoothing_factor": 1.0,
13 | "downscale_delay_s": 300.0,
14 | "upscale_delay_s": 90.0,
15 | },
16 | "ray_actor_options": {
17 | "num_cpus": 0.1
18 | }
19 | }
20 | LLMTEMPLATE_MODEL_CONFIG_COMPARATION = {
21 | "warmup": True,
22 | "model_task": "text-generation",
23 | "model_id": "template",
24 | "max_input_words": 800,
25 | "initialization": {
26 | "runtime_env": {
27 | "pip": ["deepspeed==0.9.2","accelerate"]
28 | },
29 | "initializer":{
30 | "type": "SingleDevice",
31 | "dtype": "float32",
32 | "from_pretrained_kwargs":{
33 | "use_cache": True ,
34 | "trust_remote_code": True
35 | }
36 |
37 | },
38 | "pipeline": "default"
39 | },
40 | "generation":{
41 | "max_batch_size": 18,
42 | "generate_kwargs":{
43 | "do_sample": True,
44 | "max_new_tokens": 128,
45 | "min_new_tokens": 16,
46 | "temperature": 0.7,
47 | "repetition_penalty": 1.1,
48 | "top_p": 0.8,
49 | "top_k": 50,
50 | },
51 | "prompt_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n",
52 | "stopping_sequences": ["### Response:", "### End"]
53 | }
54 | }
55 |
56 | # TODO: the "defaulttransformers" pipeline loads models via the transformers pipeline API, which is a problem because some models cannot be loaded that way
57 | LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL = {
58 | "warmup": True,
59 | "model_task": "text-generation",
60 | "model_id": "template",
61 | "max_input_words": 800,
62 | "initialization": {
63 | "runtime_env": {
64 | "pip": ["deepspeed==0.9.2","accelerate"]
65 | },
66 | "initializer":{
67 | "type": "TransformersPipeline",
68 | "dtype": "float32",
69 | "use_fast": False,
70 | "from_pretrained_kwargs":{
71 | "use_cache": True ,
72 | "trust_remote_code": True
73 | }
74 |
75 | },
76 | "pipeline": "defaulttransformers"
77 | },
78 | "generation":{
79 | "max_batch_size": 18,
80 | "generate_kwargs":{
81 | "do_sample": True,
82 | "max_new_tokens": 128,
83 | "min_new_tokens": 16,
84 | "temperature": 0.7,
85 | "repetition_penalty": 1.1,
86 | "top_p": 0.8,
87 | "top_k": 50,
88 | },
89 | "prompt_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n",
90 | "stopping_sequences": ["### Response:", "### End"]
91 | }
92 | }
93 |
94 |
95 | LLMTEMPLATE_SCALE_CONFIG = {
96 | "num_workers":1,
97 | "num_gpus_per_worker":0.0,
98 | "num_cpus_per_worker":1.0,
99 | "placement_strategy":'PACK',
100 | "resources_per_worker":None,
101 | "pg_timeout_s":600
102 | }
103 | EXPERIMENTAL_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL.copy())
104 | EXPERIMENTAL_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy()
105 |
106 | COMPARATION_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_COMPARATION.copy())
107 | COMPARATION_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy()
108 |
109 | RAY_AGENT_ADDRESS = "http://localhost:52365"
110 |
111 | MODELS_MAPPING = {
112 | "gpt2": "./models/text-generation--gpt2.yaml",
113 | "t5-small": "./models/translation--t5-small.yaml",
114 | "THUDM/chatglm2-6b": "./models/text-generation--THUDM-chatglm2-6b.yaml",
115 | "THUDM/chatglm-6b": "./models/text-generation--THUDM-chatglm-6b.yaml",
116 | "Qwen/Qwen-7B": "./models/text-generation--Qwen--Qwen-7B.yaml",
117 | "Qwen/Qwen-7B-Chat": "./models/text-generation--Qwen--Qwen-7B-Chat.yaml",
118 | "LinkSoul/Chinese-Llama-2-7b": "./models/text-generation--LinkSoul--Chinese-Llama-2-7b.yaml",
119 | "bigscience/bloom-560m": "./models/text-generation--bigscience--bloom-560m.yaml",
120 | "baichuan-inc/Baichuan-7B": "./models/text-generation--baichuan-inc--Baichuan-7B.yaml",
121 | "distilbert-base-uncased-finetuned-sst-2-english": "./models/text-classification--distilbert-base-uncased-finetuned-sst-2-english.yaml",
122 | "facebook/bart-large-cnn": "./models/summarization--facebook--bart-large-cnn.yaml",
123 | "deepset/roberta-base-squad2": "./models/question-answering--deepset--roberta-base-squad2.yaml",
124 | "nlpconnect/vit-gpt2-image-captioning": "./models/image-to-text--nlpconnect--vit-gpt2-image-captioning.yaml"
125 | }
126 |
127 | URL = "http://127.0.0.1:8000/"
128 | SERVE_RUN_HOST = "0.0.0.0"
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/default_pipeline.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import List, Optional, Union
3 |
4 | import torch
5 | from transformers import PreTrainedModel, PreTrainedTokenizer
6 |
7 | from llmadmin.backend.logger import get_logger
8 | from llmadmin.backend.server.models import Response
9 |
10 | from ._base import BasePipeline
11 | from .processors import StopOnTokens
12 | from .utils import construct_prompts, truncate_to_first_stop_token
13 |
14 | logger = get_logger(__name__)
15 |
16 |
17 | class DefaultPipeline(BasePipeline):
18 | """Default text generation pipeline.
19 |
20 | Args:
21 | model (PreTrainedModel): Hugging Face model.
22 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer.
23 | prompt_format (Optional[str], optional): Prompt format. Defaults to None.
24 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's
25 | device.
26 | """
27 |
28 | def __init__(
29 | self,
30 | model: PreTrainedModel,
31 | tokenizer: PreTrainedTokenizer,
32 | prompt_format: Optional[str] = None,
33 | device: Optional[Union[str, int, torch.device]] = None,
34 | ) -> None:
35 | super().__init__(
36 | model=model,
37 | tokenizer=tokenizer,
38 | prompt_format=prompt_format,
39 | device=device,
40 | )
41 |
42 | def preprocess(self, prompts: List[str], **generate_kwargs):
43 | st = time.monotonic()
44 | prompt_text = construct_prompts(prompts, prompt_format=self.prompt_format)
45 | instruction_text = construct_prompts(prompts, prompt_format="")
46 | if self.tokenizer.pad_token is None:
47 | self.tokenizer.pad_token = self.tokenizer.eos_token
48 |
49 | inputs = self.tokenizer(
50 | prompt_text, return_tensors="pt", padding=True, **generate_kwargs
51 | ).to(self.model.device)
52 | if not generate_kwargs.get("return_token_type_ids", True):
53 | inputs.pop("token_type_ids", None)
54 | et = time.monotonic() - st
55 | return {
56 | "inputs": inputs,
57 | "instruction_text": instruction_text,
58 | "prompt_text": prompt_text,
59 | "preprocessing_time": et,
60 | }
61 |
62 | def forward(self, model_inputs, **generate_kwargs):
63 | st = time.monotonic()
64 | inputs = model_inputs["inputs"]
65 | instruction_text = model_inputs["instruction_text"]
66 | prompt_text = model_inputs["prompt_text"]
67 | preprocessing_time = model_inputs["preprocessing_time"]
68 | generated_sequence = self.model.generate(
69 | **{
70 | **inputs,
71 | **generate_kwargs,
72 | }
73 | )
74 | et = time.monotonic() - st
75 | return {
76 | "inputs": inputs,
77 | "generated_sequence": generated_sequence,
78 | "instruction_text": instruction_text,
79 | "prompt_text": prompt_text,
80 | "preprocessing_time": preprocessing_time,
81 | "generation_time": et,
82 | "generate_kwargs": generate_kwargs,
83 | }
84 |
85 | def postprocess(self, model_outputs, **postprocess_kwargs) -> List[Response]:
86 | st = time.monotonic()
87 | tokens = model_outputs["generated_sequence"]
88 | input_ids = model_outputs["inputs"]["input_ids"]
89 | token_stopper = next(
90 | (
91 | x
92 | for x in model_outputs["generate_kwargs"].get("stopping_criteria", [])
93 | if isinstance(x, StopOnTokens)
94 | ),
95 | None,
96 | )
97 | decoded: List[Response] = []
98 | num_generated_tokens_batch = 0
99 | num_input_tokens_batch = 0
100 | for token_unwrapped, inputs_unwrapped in zip(tokens, input_ids):
101 | logger.info(
102 | f"Unprocessed generated tokens: '{self.tokenizer.decode(token_unwrapped, skip_special_tokens=False).encode('unicode_escape').decode('utf-8')}'"
103 | )
104 | tokens = token_unwrapped[len(inputs_unwrapped) :]
105 | if token_stopper:
106 | tokens = truncate_to_first_stop_token(
107 | tokens, token_stopper.stopping_sequences
108 | )
109 | text = (
110 | self.tokenizer.decode(tokens, skip_special_tokens=True)
111 | .replace("\u200b", "")
112 | .strip()
113 | )
114 | for i in range(len(inputs_unwrapped)):
115 | if inputs_unwrapped[i] != self.tokenizer.pad_token_id:
116 | break
117 | num_input_tokens = len(inputs_unwrapped[i:])
118 | num_generated_tokens = len(tokens)
119 | response = Response(
120 | generated_text=text,
121 | num_generated_tokens=num_generated_tokens,
122 | num_input_tokens=num_input_tokens,
123 | )
124 | num_generated_tokens_batch += num_generated_tokens
125 | num_input_tokens_batch += num_input_tokens
126 | decoded.append(response)
127 | et = time.monotonic() - st
128 | for response in decoded:
129 | response.num_generated_tokens_batch = num_generated_tokens_batch
130 | response.num_input_tokens_batch = num_input_tokens_batch
131 | response.preprocessing_time = model_outputs["preprocessing_time"]
132 | response.generation_time = model_outputs["generation_time"]
133 | response.postprocessing_time = et
134 | return decoded
135 |
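136 | 
137 | # Usage sketch (illustrative; assumes BasePipeline.__call__ chains
138 | # preprocess -> forward -> postprocess, and that the model fits on the local device):
139 | #
140 | #   from transformers import AutoModelForCausalLM, AutoTokenizer
141 | #   model = AutoModelForCausalLM.from_pretrained("gpt2")
142 | #   tokenizer = AutoTokenizer.from_pretrained("gpt2")
143 | #   pipe = DefaultPipeline(model, tokenizer, prompt_format="{instruction}\n")
144 | #   responses = pipe(["Write a one-line poem about GPUs."], max_new_tokens=32)
145 | #   print(responses[0].generated_text)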
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/default_transformers_pipeline.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, List, Optional, Union
2 |
3 | import torch
4 | from transformers import Pipeline as TransformersPipeline
5 | from transformers import PreTrainedModel, PreTrainedTokenizer, pipeline
6 |
7 | from llmadmin.backend.logger import get_logger
8 | from llmadmin.backend.server.models import Prompt, Response
9 |
10 | from ._base import BasePipeline
11 | from .utils import construct_prompts, construct_prompts_experimental
12 | # from llmadmin.backend.server.utils import render_gradio_params
13 | from .default_pipeline import DefaultPipeline
14 |
15 | try:
16 | import transformers
17 | from transformers import pipelines
18 | except ImportError as ie:
19 | raise ImportError(
20 | "transformers not installed. Please try `pip install transformers`"
21 | ) from ie
22 |
23 | if TYPE_CHECKING:
24 | from ..initializers._base import LLMInitializer
25 |
26 | logger = get_logger(__name__)
27 |
28 |
29 | class DefaultTransformersPipeline(BasePipeline):
30 | """Text generation pipeline using Transformers Pipeline.
31 |
32 | May not support all features.
33 |
34 | Args:
35 | model (PreTrainedModel): Hugging Face model.
36 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer.
37 | prompt_format (Optional[str], optional): Prompt format. Defaults to None.
38 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's
39 | device.
40 | """
41 |
42 | def __init__(
43 | self,
44 | model: PreTrainedModel,
45 | tokenizer: PreTrainedTokenizer,
46 | prompt_format: Optional[str] = None,
47 | device: Optional[Union[str, int, torch.device]] = None,
48 | task: str = None,
49 | ) -> None:
50 | if not hasattr(model, "generate"):
51 | raise ValueError("Model must have a generate method.")
52 | super().__init__(model, tokenizer, prompt_format, device)
53 |
54 | self.pipeline = None
55 | self.preprocess = None
56 | self.postprocess = None
57 |
58 | def _get_transformers_pipeline(self, **kwargs) -> TransformersPipeline:
59 | default_kwargs = dict(
60 | task="text-generation",
61 | model=self.model,
62 | tokenizer=self.tokenizer,
63 | device=None,
64 | )
65 | transformers_pipe = pipeline(**{**default_kwargs, **kwargs})
66 | transformers_pipe.device = self.device
67 | return transformers_pipe
68 |
69 | @torch.inference_mode()
70 | def __call__(self, inputs: List[Union[str, Prompt]], **kwargs) -> List[Response]:
71 | if not self.pipeline:
72 | self.pipeline = self._get_transformers_pipeline()
73 |
74 | logger.info(f"input from pipeline: ****** {inputs}")
75 | inputs = construct_prompts_experimental(
76 | inputs, prompt_format=self.prompt_format)
77 |
78 | logger.info(f"input from pipeline: ****** {inputs}")
79 |
80 |         # Fall back to the raw inputs when no preprocess hook is set (avoids an undefined `data` below)
81 |         data = self.preprocess(inputs) if self.preprocess else inputs
82 |
83 | logger.info(data)
84 | kwargs.pop("stopping_sequences", None)
85 | kwargs.pop("timeout_s", None)
86 | kwargs.pop("start_timestamp", None)
87 | # special cases that needs to be handled differently
88 | if isinstance(
89 | self.pipeline,
90 | (
91 | pipelines.text_classification.TextClassificationPipeline,
92 | pipelines.text2text_generation.Text2TextGenerationPipeline,
93 | pipelines.text2text_generation.TranslationPipeline,
94 | ),
95 | ):
96 | data = self.pipeline(*data, **kwargs)
97 | else:
98 | data = self.pipeline(**data, **kwargs)
99 |
100 | logger.info(f"output from pipeline: ****** {data}")
101 |         # Fall back to the pipeline output when no postprocess hook is set
102 |         output = self.postprocess(data) if self.postprocess else data
103 |
104 | return output
105 |
106 | @classmethod
107 | def from_initializer(
108 | cls,
109 | initializer: "LLMInitializer",
110 | model_id: str,
111 | prompt_format: Optional[str] = None,
112 | device: Optional[Union[str, int, torch.device]] = None,
113 | stopping_sequences: List[Union[int, str]] = None,
114 | **kwargs,
115 | ) -> "DefaultTransformersPipeline":
116 | model_from_pretrained_kwargs = initializer.get_model_from_pretrained_kwargs()
117 | default_kwargs = dict(
118 | model=model_id,
119 | **kwargs,
120 | **model_from_pretrained_kwargs
121 | )
122 |
123 | transformers_pipe = pipeline(
124 | **default_kwargs,
125 | model_kwargs=initializer.get_model_init_kwargs(),
126 | )
127 | # transformers_pipe.model = initializer.postprocess_model(transformers_pipe.model)
128 | pipe = cls(
129 | model=transformers_pipe.model,
130 | tokenizer=transformers_pipe.tokenizer,
131 | prompt_format=prompt_format,
132 | device=device,
133 | # stopping_sequences=stopping_sequences,
134 | **kwargs,
135 | )
136 | pipe.pipeline = transformers_pipe
137 | transformers_pipe.device = pipe.device
138 |
139 | # if "task" in kwargs:
140 | # pipeline_info = render_gradio_params(kwargs["task"])
141 | # pipe.preprocess = pipeline_info["preprocess"]
142 | # pipe.postprocess = pipeline_info["postprocess"]
143 |
144 | return pipe
145 |
146 | def preprocess(self, prompts: List[str], **generate_kwargs):
147 | pass
148 |
149 | def forward(self, model_inputs, **generate_kwargs):
150 | pass
151 |
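152 | 
153 | # Usage sketch (illustrative; in practice the serve backend builds this pipeline from the
154 | # model YAML, where `initialization.pipeline: defaulttransformers` selects this class).
155 | # `initializer` below stands for an LLMInitializer built from the YAML's `initializer` block:
156 | #
157 | #   pipe = DefaultTransformersPipeline.from_initializer(
158 | #       initializer,
159 | #       model_id="gpt2",
160 | #       prompt_format="{instruction}\n",
161 | #   )
162 | #   responses = pipe(["Summarize Ray Serve in one sentence."], max_new_tokens=32)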
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/tasks/noheader_AdvertiseGen.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModel, DataCollatorForSeq2Seq
3 | from typing import Any
4 | import pandas as pd
5 | import numpy as np
6 | import jieba
7 | from rouge_chinese import Rouge
8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
9 |
10 | class NoheaderAdvertiseGen(Task):
11 | AUTO_MODEL_CLASS = AutoModel
12 |
13 | DATASET_PATH = "AdvertiseGen"
14 |
15 | def get_data_proprocess(self) -> Any:
16 | tokenizer = self.tokenizer
17 | max_length = self.ft_config.train_config.base_config.max_length
18 |         # TODO: refactor this into a Python decorator
19 | def preprocess_function(examples: pd.DataFrame):
20 | # examples = examples.to_dict("list")
21 | #-- start
22 | max_source_length = int(max_length / 2)
23 | max_target_length = max_length - max_source_length
24 | # max_seq_length = data_args.max_source_length + data_args.max_target_length
25 |
26 | model_inputs = {
27 | "input_ids": [],
28 | "labels": [],
29 | }
30 | for i in range(len(examples["content"])):
31 | if examples["content"][i] and examples["summary"][i]:
32 | prompt, answer = examples["content"][i], examples["summary"][i]
33 |
34 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
35 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False)
36 |
37 | if len(a_ids) > max_source_length - 1:
38 | a_ids = a_ids[: max_source_length - 1]
39 |
40 | if len(b_ids) > max_target_length - 2:
41 | b_ids = b_ids[: max_target_length - 2]
42 |
43 | input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
44 |
45 | context_length = input_ids.index(tokenizer.bos_token_id)
46 | mask_position = context_length - 1
47 | labels = [-100] * context_length + input_ids[mask_position+1:]
48 |
49 | # pad_len = max_length - len(input_ids)
50 | # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
51 | # labels = labels + [tokenizer.pad_token_id] * pad_len
52 | # if data_args.ignore_pad_token_for_loss:
53 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
54 |
55 | model_inputs["input_ids"].append(input_ids)
56 | model_inputs["labels"].append(labels)
57 |
58 |
59 | # Add back the original columns
60 | ret = {**examples, **model_inputs}
61 | return pd.DataFrame.from_dict(ret)
62 |
63 | return preprocess_function
64 |
65 | def get_compute_metrics(self) -> Any:
66 | tokenizer = self.tokenizer
67 |
68 | def compute_metrics(eval_preds):
69 | preds, labels = eval_preds
70 | if isinstance(preds, tuple):
71 | preds = preds[0]
72 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
73 |
74 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
75 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
76 |
77 | score_dict = {
78 | "rouge-1": [],
79 | "rouge-2": [],
80 | "rouge-l": [],
81 | "bleu-4": []
82 | }
83 | for pred, label in zip(decoded_preds, decoded_labels):
84 | hypothesis = list(jieba.cut(pred))
85 | reference = list(jieba.cut(label))
86 | rouge = Rouge()
87 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference))
88 | result = scores[0]
89 |
90 | for k, v in result.items():
91 | score_dict[k].append(round(v["f"] * 100, 4))
92 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
93 | score_dict["bleu-4"].append(round(bleu_score * 100, 4))
94 |
95 | for k, v in score_dict.items():
96 | score_dict[k] = float(np.mean(v))
97 | return score_dict
98 |
99 | return compute_metrics
100 |
101 | def get_data_collator(self) -> Any:
102 | data_collator = DataCollatorForSeq2Seq(
103 | tokenizer=self.tokenizer,
104 | model=self.model,
105 | label_pad_token_id=-100,
106 | pad_to_multiple_of=None,
107 | padding=True
108 | )
109 | return data_collator
110 |
111 | def training_key(self):
112 | """
113 | :return: Iterable[obj]
114 |         An iterable of any object that doc_to_text can handle
115 | """
116 | return "train"
117 |
118 | def validation_key(self):
119 | """
120 | :return: Iterable[obj]
121 |         An iterable of any object that doc_to_text can handle
122 | """
123 | return "validation"
124 |
125 | def getTrainDataSet(self):
126 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True)
127 |
128 | def getEvalDataSet(self):
129 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True)
130 |
131 | def getSmallTrainDataSet(self, len: int):
132 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
133 |
134 | def getSmallEvalDataSet(self, len: int):
135 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True)
136 |
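137 | 
138 | # Label layout sketch (follows directly from preprocess_function above): with
139 | # context_length = input_ids.index(tokenizer.bos_token_id), e.g. bos at index 3,
140 | #   labels == [-100, -100, -100] + input_ids[3:]
141 | # so positions before the bos token are masked out of the loss and only the tokens
142 | # from the bos position onward are learned.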
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # LLM - Finetune
3 |
4 | A framework for training large language models that supports LoRA, full-parameter fine-tuning, and more. Define a YAML file specifying the base model, dataset, and training method, and easily start your training or fine-tuning job. Easy to define, easy to start. Feedback and stars ⭐️ are welcome!
5 |
6 |
7 |
8 | Two steps to run your LLM finetune:
9 |
10 | ## 1. Easy Install
11 |
12 | ### Installation
13 |
14 | Use the `bash` shell for the following commands.
15 |
16 | ```bash
17 | git clone https://github.com/OpenCSGs/llm-finetune.git
18 | cd llm-finetune
19 | pip install .  # In China, you can install from a mirror: 'pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple'
20 | ```
21 |
22 | ## 2. Easy Run
23 | ### Finetune model by command
24 |
25 | ```
26 | llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml
27 | ```
28 |
29 | Your Finetune task is starting now.
30 |
31 | - You can add more YAML files to define your own tasks; see the sketch at the end of this README.
32 |
33 | *******
34 | ## (Optional) Additional launchers: Launch with accelerate or deepspeed
35 | Ensure accelerate and deepspeed are installed, then follow the steps below.
36 |
37 | ### Launch by accelerate
38 |
39 | Modify the parameters of `accelerate launch` for distributed training.
40 |
41 | #### Finetune on CPU
42 |
43 | ```
44 | # Use CPU
45 | accelerate launch --cpu --num_machines=1 --num_processes=1 --num_cpu_threads_per_process=1 --mixed_precision=no --dynamo_backend=no llm_finetune.py run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased.yaml
46 | ```
47 |
48 | #### Finetune on GPU on single host
49 |
50 | Control GPU visibility with `CUDA_VISIBLE_DEVICES`.
51 |
52 | ```
53 | # Use GPU:0
54 | CUDA_VISIBLE_DEVICES=0 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml
55 |
56 | # Use GPU:1
57 | CUDA_VISIBLE_DEVICES=1 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml
58 |
59 | # Use GPU:0
60 | accelerate launch --num_machines=1 --num_processes=1 --gpu_ids=0 llmfinetune ...
61 | ```
62 |
63 | #### Finetune on multi-GPUs on single host
64 |
65 | ```
66 | # Use all GPUs with mixed precision disabled
67 | accelerate launch --multi_gpu llmfinetune ...
68 |
69 | # Use all GPUs with mxied precision
70 | accelerate launch --multi_gpu --mixed_precision=fp16 llmfinetune ...
71 |
72 | # Use GPU:0 and GPU:1
73 | CUDA_VISIBLE_DEVICES=0,1 accelerate launch --multi_gpu --gpu_ids=0,1 llmfinetune ...
74 |
75 | # Launch with 2 GPUs
76 | accelerate launch --multi_gpu --num_processes 2 llmfinetune ...
77 | ```
78 |
79 | ```
80 | # Use default_config.yaml
81 | compute_environment: LOCAL_MACHINE
82 | deepspeed_config: {}
83 | distributed_type: MULTI_GPU
84 | downcast_bf16: 'no'
85 | dynamo_backend: 'NO'
86 | fsdp_config: {}
87 | gpu_ids: all # all GPU ids
88 | machine_rank: 0
89 | main_training_function: main
90 | megatron_lm_config: {}
91 | mixed_precision: fp16 # mixed precision
92 | num_machines: 1 # a single machine
93 | num_processes: 4 # 4 GPUs
94 | rdzv_backend: static
95 | same_network: true
96 | use_cpu: false
97 |
98 |
99 | accelerate launch --config_file default_config.yaml llmfinetune ...
100 | ```
101 |
102 | #### Finetune on multi-GPUs on multi-hosts
103 |
104 | All hosts need passwordless SSH access to each other.
105 |
106 | ```
107 | # default_config.yaml
108 | compute_environment: LOCAL_MACHINE
109 | deepspeed_config:
110 | deepspeed_multinode_launcher: standard
111 | gradient_accumulation_steps: 1
112 | gradient_clipping: 1.0
113 | offload_optimizer_device: none
114 | offload_param_device: none
115 | zero3_init_flag: true
116 | zero3_save_16bit_model: true
117 | zero_stage: 3
118 | distributed_type: DEEPSPEED
119 | downcast_bf16: 'no'
120 | dynamo_config: {}
121 | fsdp_config: {}
122 | main_training_function: main
123 | megatron_lm_config: {}
124 | mixed_precision: fp16
125 | num_machines: 2 # 2 nodes
126 | num_processes: 16 # 16 GPUs of all nodes
127 | tpu_env: []
128 | tpu_use_cluster: false
129 | tpu_use_sudo: false
130 | use_cpu: false
131 |
132 | # Run on all hosts, specifying `RANK`, `MASTER_ADDR`, and `MASTER_PORT` on each
133 | accelerate launch --config_file default_config.yaml \
134 | --machine_rank ${RANK} \
135 | --main_process_ip ${MASTER_ADDR} \
136 | --main_process_port ${MASTER_PORT} \
137 | ...
138 |
139 | # --machine_rank: 0 for the main/master node; 1, 2, 3, etc. for the other nodes
140 | ```
141 |
142 | #### Finetune by Deepspeed for multi-GPUs on multi-hosts
143 |
144 | All hosts need passwordless SSH access to each other.
145 |
146 | ```
147 | # myhostfile
148 | node1 slots=1
149 | node2 slots=1
150 |
151 | # deepspeed.json
152 | {
153 | "train_batch_size": "auto",
154 | "train_micro_batch_size_per_gpu": "auto",
155 | "gradient_accumulation_steps": "auto",
156 | "gradient_clipping": "auto",
157 | "zero_allow_untested_optimizer": true,
158 | "fp16": {
159 | "enabled": "auto",
160 | "loss_scale": 0,
161 | "initial_scale_power": 16,
162 | "loss_scale_window": 1000,
163 | "hysteresis": 2,
164 | "min_loss_scale": 1
165 | },
166 | "zero_optimization": {
167 | "stage": 2,
168 | "allgather_partitions": true,
169 | "allgather_bucket_size": 5e8,
170 | "reduce_scatter": true,
171 | "reduce_bucket_size": 5e8,
172 | "overlap_comm": false,
173 | "contiguous_gradients": true
174 | }
175 | }
176 |
177 | deepspeed --num_nodes=2 --hostfile=myhostfile --deepspeed deepspeed.json ...
178 |
179 | # --num_nodes: number of hosts
180 | # --hostfile: hostfile listing each host and the number of GPU slots on it
181 | # --deepspeed: deepspeed config file
182 |
183 | ```
184 |
185 |
186 |
187 |
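188 | ## Appendix: Custom finetune YAML (sketch)
189 | 
190 | The exact schema is defined by `FTApp` in `llmadmin/backend/server/models.py`, so the easiest way to create a new task is to copy one of the bundled files under `./models/` and adjust it. The snippet below is only an illustrative sketch of the kind of fields the tasks under `llmadmin/backend/llm/ft/tasks/` read (`data_config.data_path`, `data_config.input_columns`, `data_config.validation_column`, `train_config.base_config.max_length`); the real layout may differ.
191 | 
192 | ```
193 | # Illustrative sketch only -- start from a bundled ./models/ft--*.yaml instead
194 | data_config:
195 |   data_path: AdvertiseGen          # dataset name or path
196 |   input_columns: ["content"]       # prompt column(s)
197 |   validation_column: summary       # response/target column
198 | train_config:
199 |   base_config:
200 |     max_length: 192                # max combined source + target length
201 | ```
202 | 
203 | Then start it the same way as the bundled tasks (the file name is your own):
204 | 
205 | ```
206 | llmfinetune run-ft --ft=./models/my-custom-task.yaml
207 | ```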
--------------------------------------------------------------------------------
/llmadmin/common/backend.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from abc import ABC, abstractmethod
4 | from typing import Any, Dict, List, Union
5 |
6 | import requests
7 |
8 | from llmadmin.common.constants import TIMEOUT
9 |
10 |
11 | class BackendError(RuntimeError):
12 | def __init__(self, *args: object, **kwargs) -> None:
13 | self.response = kwargs.pop("response", None)
14 | super().__init__(*args)
15 |
16 | logger = logging.getLogger("ray.logger")
17 |
18 | def get_llmadmin_backend(url: str = "http://127.0.0.1:8000/cmp_models_default"):
19 | """
20 |     Establishes a connection to the LLMAdmin backend, using environment
21 |     variables for configuration.
22 |     If the AVIARY_MOCK environment variable is set, a mock backend is used.
23 | 
24 |     For a direct connection to the llmadmin backend (e.g. running on the same cluster),
25 |     no AVIARY_TOKEN is required. Otherwise, the AVIARY_URL and AVIARY_TOKEN environment
26 |     variables are required.
27 |
28 | Returns:
29 | backend: An instance of the Backend class.
30 | """
31 | mock_backend = os.getenv("AVIARY_MOCK", False)
32 | if mock_backend:
33 | backend = MockBackend()
34 | return backend
35 | print(os.getenv("AVIARY_URL"))
36 | llmadmin_url = url
37 | assert llmadmin_url is not None, "AVIARY_URL must be set"
38 | backend_token = os.getenv("AVIARY_TOKEN")
39 | bearer = f"Bearer {backend_token}" if backend_token is not None else ""
40 | if not llmadmin_url.endswith("/"):
41 | llmadmin_url += "/"
42 | print("Connecting to LLMAdmin backend at: ", llmadmin_url)
43 | backend = LLMAdminBackend(llmadmin_url, bearer)
44 | return backend
45 |
46 |
47 | class Backend(ABC):
48 | """Abstract interface for talking to LLMAdmin."""
49 |
50 | @abstractmethod
51 | def models(self) -> List[str]:
52 | pass
53 |
54 | @abstractmethod
55 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]:
56 | pass
57 |
58 | @abstractmethod
59 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]:
60 | pass
61 |
62 | @abstractmethod
63 | def batch_completions(
64 | self, prompts: List[str], llm: str
65 | ) -> List[Dict[str, Union[str, float, int]]]:
66 | pass
67 |
68 |
69 | class LLMAdminBackend(Backend):
70 | """Interface for talking to LLMAdmin.
71 | Deliberately designed to be similar to OpenAI's
72 | Completions interface.
73 |
74 | https://platform.openai.com/docs/api-reference/completions?lang=python
75 | """
76 |
77 | def __init__(self, backend_url: str, bearer: str):
78 | assert "::param" not in backend_url, "backend_url not set correctly"
79 | assert "::param" not in bearer, "bearer not set correctly"
80 |
81 | self.backend_url = backend_url
82 | self.bearer = bearer
83 | self.header = {"Authorization": self.bearer}
84 |
85 | def models(self) -> List[str]:
86 | url = self.backend_url + "models"
87 | print("Connecting backend to get models at: ", url)
88 | response = requests.get(url, headers=self.header, timeout=TIMEOUT)
89 | try:
90 | result = response.json()
91 | except requests.JSONDecodeError as e:
92 | raise BackendError(
93 | f"Error decoding JSON from {url}. Text response: {response.text}",
94 | response=response,
95 | ) from e
96 | return result
97 |
98 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]:
99 | url = self.backend_url + "metadata/" + llm.replace("/", "--")
100 | response = requests.get(url, headers=self.header, timeout=TIMEOUT)
101 | try:
102 | result = response.json()
103 | except requests.JSONDecodeError as e:
104 | raise BackendError(
105 | f"Error decoding JSON from {url}. Text response: {response.text}",
106 | response=response,
107 | ) from e
108 | return result
109 |
110 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]:
111 | url = self.backend_url + "query/" + llm.replace("/", "--")
112 | response = requests.post(
113 | url,
114 | headers=self.header,
115 | json={"prompt": prompt},
116 | timeout=TIMEOUT,
117 | )
118 | try:
119 | return response.json()[llm]
120 | except requests.JSONDecodeError as e:
121 | raise BackendError(
122 | f"Error decoding JSON from {url}. Text response: {response.text}",
123 | response=response,
124 | ) from e
125 |
126 | def batch_completions(
127 | self, prompts: List[str], llm: str
128 | ) -> List[Dict[str, Union[str, float, int]]]:
129 | url = self.backend_url + "query/batch/" + llm.replace("/", "--")
130 | response = requests.post(
131 | url,
132 | headers=self.header,
133 | json=[{"prompt": prompt} for prompt in prompts],
134 | timeout=TIMEOUT,
135 | )
136 | try:
137 | return response.json()[llm]
138 | except requests.JSONDecodeError as e:
139 | raise BackendError(
140 | f"Error decoding JSON from {url}. Text response: {response.text}",
141 | response=response,
142 | ) from e
143 |
144 |
145 | class MockBackend(Backend):
146 | """Mock backend for testing"""
147 |
148 | def __init__(self):
149 | pass
150 |
151 | def models(self) -> List[str]:
152 | return ["A", "B", "C"]
153 |
154 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]:
155 | return {
156 | "metadata": {
157 | "model_config": {
158 | "model_id": llm,
159 | "model_url": f"https://huggingface.co/org/{llm}",
160 | "model_description": f"This is a model description for model {llm}",
161 | }
162 | }
163 | }
164 |
165 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]:
166 | return {
167 | "generated_text": prompt,
168 | "total_time": 99,
169 | "num_total_tokens": 42.3,
170 | }
171 |
172 | def batch_completions(
173 | self, prompts: List[str], llm: str
174 | ) -> List[Dict[str, Union[str, float, int]]]:
175 | return [
176 | {
177 | "generated_text": prompt,
178 | "total_time": 99,
179 | "num_total_tokens": 42.3,
180 | }
181 | for prompt in prompts
182 | ]
183 |
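184 | 
185 | if __name__ == "__main__":
186 |     # Illustrative usage sketch (not part of the original module): exercise the mock
187 |     # backend locally; unset AVIARY_MOCK and pass a real URL to talk to a live deployment.
188 |     os.environ["AVIARY_MOCK"] = "1"
189 |     backend = get_llmadmin_backend()
190 |     print(backend.models())
191 |     print(backend.completions("Hello", llm="A"))
192 |     print(backend.batch_completions(["Hello", "World"], llm="A"))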
--------------------------------------------------------------------------------
/llmadmin/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
3 |
4 | import torch
5 |
6 | from llmadmin.backend.logger import get_logger
7 | from llmadmin.backend.server.models import Response
8 |
9 | from ...initializers.llamacpp import LlamaCppInitializer, LlamaCppTokenizer
10 | from .._base import StreamingPipeline
11 | from ..utils import decode_stopping_sequences_where_needed, construct_prompts
12 |
13 | if TYPE_CHECKING:
14 | from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList
15 |
16 | logger = get_logger(__name__)
17 |
18 |
19 | class LlamaCppPipeline(StreamingPipeline):
20 | """Text generation pipeline using llama.cpp.
21 |
22 | May not support all features."""
23 |
24 | def __init__(
25 | self,
26 | model: "Llama",
27 | tokenizer: LlamaCppTokenizer,
28 | prompt_format: Optional[str] = None,
29 | device: Optional[Union[str, int, torch.device]] = None,
30 | **kwargs,
31 | ) -> None:
32 | from llama_cpp import Llama
33 |
34 | if not isinstance(model, Llama):
35 | raise TypeError("Model must be an instance of llama_cpp.Llama.")
36 | self.model = model
37 | self.kwargs = kwargs
38 | self.tokenizer = tokenizer
39 | self.device = device
40 | self.prompt_format = prompt_format
41 |
42 | def _get_logits_processors(
43 | self, generate_kwargs: Dict[str, Any], model_inputs=None
44 | ) -> "LogitsProcessorList":
45 | from llama_cpp import LogitsProcessorList
46 |
47 | from llmadmin.backend.llm.pipelines.llamacpp.processors import (
48 | LlamaCppMinNewTokensLengthLogitsProcessor,
49 | )
50 |
51 | lst = []
52 |
53 | if "min_new_tokens" in generate_kwargs:
54 | lst.append(
55 | LlamaCppMinNewTokensLengthLogitsProcessor(
56 | prompt_length_to_skip=len(model_inputs["tokenized_inputs"]),
57 | min_new_tokens=generate_kwargs.pop("min_new_tokens", 4),
58 | eos_token_id=self.model.token_eos(),
59 | )
60 | )
61 |
62 | return LogitsProcessorList(lst)
63 |
64 | def _get_stopping_criteria(
65 | self, generate_kwargs: Dict[str, Any], model_inputs=None
66 | ) -> "StoppingCriteriaList":
67 | from llama_cpp import StoppingCriteriaList
68 |
69 | from llmadmin.backend.llm.pipelines.llamacpp.processors import (
70 | LlamaMaxTimeCriteria,
71 | )
72 |
73 | lst = []
74 |
75 | timeout_s = generate_kwargs.pop("timeout_s", None)
76 | start_timestamp = generate_kwargs.pop("start_timestamp", None)
77 | if timeout_s is not None and start_timestamp is not None:
78 | lst.append(LlamaMaxTimeCriteria(timeout_s, start_timestamp))
79 |
80 | return StoppingCriteriaList(lst)
81 |
82 | def _add_default_generate_kwargs(
83 | self, generate_kwargs: Dict[str, Any], model_inputs=None
84 | ) -> Dict[str, Any]:
85 | generate_kwargs = generate_kwargs.copy()
86 | generate_kwargs.setdefault("echo", False)
87 | stopping_sequences = generate_kwargs.pop("stopping_sequences")
88 | stopping_sequences = decode_stopping_sequences_where_needed(
89 | self.tokenizer, stopping_sequences
90 | )
91 | generate_kwargs.setdefault("stop", stopping_sequences)
92 | generate_kwargs["logits_processor"] = self._get_logits_processors(
93 | generate_kwargs, model_inputs=model_inputs
94 | )
95 | generate_kwargs["stopping_criteria"] = self._get_stopping_criteria(
96 | generate_kwargs, model_inputs=model_inputs
97 | )
98 | return generate_kwargs
99 |
100 | def __call__(self, inputs: List[str], **kwargs) -> List[Response]:
101 | logger.info(inputs)
102 | inputs = construct_prompts(
103 | inputs, prompt_format=self.prompt_format)
104 |
105 | logger.info(inputs)
106 | tokenized_inputs = self.tokenizer.encode(inputs[0])
107 | kwargs = self._add_default_generate_kwargs(
108 | kwargs,
109 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs},
110 | )
111 |
112 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}")
113 | responses = []
114 | for input in inputs:
115 | st = time.monotonic()
116 | output = self.model(input, **kwargs)
117 | gen_time = time.monotonic() - st
118 | text = output["choices"][0]["text"].replace("\u200b", "").strip()
119 | responses.append(
120 | Response(
121 | generated_text=text,
122 | num_generated_tokens=output["usage"]["completion_tokens"],
123 | num_input_tokens=output["usage"]["prompt_tokens"],
124 | num_generated_tokens_batch=output["usage"]["completion_tokens"],
125 | num_input_tokens_batch=output["usage"]["prompt_tokens"],
126 | preprocessing_time=None,
127 | postprocessing_time=None,
128 | generation_time=gen_time,
129 | )
130 | )
131 | return responses
132 |
133 | def stream(
134 | self,
135 | inputs: List[str],
136 | **kwargs,
137 | ) -> Iterator[torch.LongTensor]:
138 | tokenized_inputs = self.tokenizer.encode(inputs[0])
139 | kwargs = self._add_default_generate_kwargs(
140 | kwargs,
141 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs},
142 | )
143 |
144 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}")
145 | first_token_done = False
146 |         for input in inputs:
147 |             st = time.monotonic()  # start timing before iterating the streaming generator
148 |             for output in self.model(input, stream=True, **kwargs):
149 |                 gen_time = time.monotonic() - st  # time elapsed since generation started
150 | text = output["choices"][0]["text"].replace("\u200b", "")
151 | if not first_token_done:
152 | text = text.lstrip()
153 | first_token_done = True
154 | yield [
155 | Response(
156 | generated_text=text,
157 | num_generated_tokens=1,
158 | num_input_tokens=len(tokenized_inputs),
159 | num_generated_tokens_batch=1,
160 | num_input_tokens_batch=len(tokenized_inputs),
161 | preprocessing_time=None,
162 | postprocessing_time=None,
163 | generation_time=gen_time,
164 | )
165 | ]
166 |
167 | def preprocess(self, prompts: List[str], **generate_kwargs):
168 | pass
169 |
170 | def forward(self, model_inputs, **generate_kwargs):
171 | pass
172 |
173 | @classmethod
174 | def from_initializer(
175 | cls,
176 | initializer: "LlamaCppInitializer",
177 | model_id: str,
178 | device: Optional[Union[str, int, torch.device]] = None,
179 | **kwargs,
180 | ) -> "LlamaCppPipeline":
181 | assert isinstance(initializer, LlamaCppInitializer)
182 | logger.info(f"LlamaCppPipeline initializer loading model: {model_id}")
183 | model, tokenizer = initializer.load(model_id)
184 | logger.info(f"LlamaCppPipeline loaded model: {model}")
185 | return cls(
186 | model,
187 | tokenizer,
188 | device=device,
189 | **kwargs,
190 | )
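191 | 
192 | # Usage sketch (illustrative; assumes llama-cpp-python is installed and that a local
193 | # GGML/GGUF model file is configured for LlamaCppInitializer -- see initializers/llamacpp.py;
194 | # the initializer arguments shown here are placeholders):
195 | #
196 | #   initializer = LlamaCppInitializer(...)
197 | #   pipe = LlamaCppPipeline.from_initializer(initializer, model_id="path/to/model.gguf")
198 | #   # stopping_sequences is required by __call__ (it is popped without a default):
199 | #   responses = pipe(["Hello"], max_tokens=64, stopping_sequences=["### End"])
200 | #   print(responses[0].generated_text)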
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/tasks/text_generation_AdvertiseGen.py:
--------------------------------------------------------------------------------
1 | from ._base import Task
2 | from transformers import AutoModel, DataCollatorForSeq2Seq, AutoModelForCausalLM
3 | from typing import Any
4 | import pandas as pd
5 | import numpy as np
6 | import jieba
7 | from rouge_chinese import Rouge
8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
9 |
10 | class NoheaderAdvertiseGen(Task):
11 | # AUTO_MODEL_CLASS = AutoModel
12 | AUTO_MODEL_CLASS = AutoModelForCausalLM
13 |
14 | DATASET_PATH = "AdvertiseGen"
15 | prompt_column = "content"
16 | response_column = "summary"
17 | # history_column = "history"
18 |
19 | def get_data_proprocess(self) -> Any:
20 | self.prompt_column = self.ft_config.data_config.input_columns[0]
21 | self.response_column = self.ft_config.data_config.validation_column
22 | self.DATASET_PATH = self.ft_config.data_config.data_path
23 | tokenizer = self.tokenizer
24 | max_length = self.ft_config.train_config.base_config.max_length
25 |         # TODO: refactor this into a Python decorator
26 | def preprocess_function(examples):
27 | # examples = examples.to_dict("list")
28 | #-- start
29 | max_source_length = int(max_length / 2)
30 | max_target_length = max_length - max_source_length
31 |             max_source_length = 64   # NOTE: overrides the value derived from max_length above
32 |             max_target_length = 128  # NOTE: overrides the value derived from max_length above
33 | max_seq_length = max_source_length + max_target_length + 1
34 |
35 | model_inputs = {
36 | "input_ids": [],
37 | "labels": [],
38 | }
39 | prefix = ""
40 | for i in range(len(examples[self.prompt_column])):
41 | if examples[self.prompt_column][i] and examples[self.response_column][i]:
42 | query, answer = examples[self.prompt_column][i], examples[self.response_column][i]
43 |
44 | # history = examples[history_column][i] if history_column is not None else None
45 | # history = None
46 | # prompt = tokenizer.build_prompt(query, history)
47 |
48 | prompt = prefix + query
49 | print(f"tokenizer is: {tokenizer}")
50 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True, padding=True,
51 | max_length=max_source_length)
52 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False, truncation=True, padding=True,
53 | max_length=max_target_length)
54 |
55 | context_length = len(a_ids)
56 | input_ids = a_ids + b_ids + [tokenizer.eos_token_id]
57 | labels = [tokenizer.pad_token_id] * context_length + b_ids + [tokenizer.eos_token_id]
58 |
59 | pad_len = max_seq_length - len(input_ids)
60 | input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
61 | labels = labels + [tokenizer.pad_token_id] * pad_len
62 |
63 | # if data_args.ignore_pad_token_for_loss:
64 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
65 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
66 |
67 | model_inputs["input_ids"].append(input_ids)
68 | model_inputs["labels"].append(labels)
69 |
70 | return model_inputs
71 |
72 | return preprocess_function
73 |
74 | def get_eval_preprocess(self) -> Any:
75 | tokenizer = self.tokenizer
76 | def preprocess_function_eval(examples):
77 | max_source_length = 64
78 | max_target_length = 128
79 | inputs, targets = [], []
80 | prefix = ""
81 | for i in range(len(examples[self.prompt_column])):
82 | if examples[self.prompt_column][i] and examples[self.response_column][i]:
83 | query = examples[self.prompt_column][i]
84 | # history = examples[history_column][i] if history_column is not None else None
85 | # history = None
86 | # prompt = tokenizer.build_prompt(query, history)
87 | inputs.append(query)
88 | targets.append(examples[self.response_column][i])
89 |
90 | inputs = [prefix + inp for inp in inputs]
91 | model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding=True)
92 | labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
93 |
94 | # if data_args.ignore_pad_token_for_loss:
95 | # labels["input_ids"] = [
96 | # [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
97 | # ]
98 | model_inputs["labels"] = labels["input_ids"]
99 |
100 | return model_inputs
101 |
102 | return preprocess_function_eval
103 |
104 | def get_compute_metrics(self) -> Any:
105 | tokenizer = self.tokenizer
106 |
107 | def compute_metrics(eval_preds):
108 | preds, labels = eval_preds
109 | if isinstance(preds, tuple):
110 | preds = preds[0]
111 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
112 |
113 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
114 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
115 |
116 | score_dict = {
117 | "rouge-1": [],
118 | "rouge-2": [],
119 | "rouge-l": [],
120 | "bleu-4": []
121 | }
122 | for pred, label in zip(decoded_preds, decoded_labels):
123 | hypothesis = list(jieba.cut(pred))
124 | reference = list(jieba.cut(label))
125 | rouge = Rouge()
126 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference))
127 | result = scores[0]
128 |
129 | for k, v in result.items():
130 | score_dict[k].append(round(v["f"] * 100, 4))
131 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
132 | score_dict["bleu-4"].append(round(bleu_score * 100, 4))
133 |
134 | for k, v in score_dict.items():
135 | score_dict[k] = float(np.mean(v))
136 | return score_dict
137 |
138 | return compute_metrics
139 |
140 | def get_data_collator(self) -> Any:
141 | # label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
142 | label_pad_token_id = self.tokenizer.pad_token_id
143 | data_collator = DataCollatorForSeq2Seq(
144 | tokenizer=self.tokenizer,
145 | model=self.model,
146 | label_pad_token_id=label_pad_token_id,
147 | pad_to_multiple_of=None,
148 | # padding=True
149 | padding=False
150 | )
151 | return data_collator
152 |
153 | def training_key(self):
154 | """
155 | :return: Iterable[obj]
156 |         An iterable of any object that doc_to_text can handle
157 | """
158 | return "train"
159 |
160 | def validation_key(self):
161 | """
162 | :return: Iterable[obj]
163 |         An iterable of any object that doc_to_text can handle
164 | """
165 | return "validation"
166 |
167 | def getTrainDataSet(self):
168 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column])
169 |
170 | def getEvalDataSet(self):
171 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column])
172 |
173 | def getSmallTrainDataSet(self, len: int):
174 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column])
175 |
176 | def getSmallEvalDataSet(self, len: int):
177 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column])
178 | # return self.dataset[self.validation_key()].select(range(len)).map(self.get_eval_preprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column])
179 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/ray_train.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import numpy as np
4 | import torch
5 | from ._base import BaseFT
6 | from llmadmin.backend.server.models import FTApp
7 |
8 | from datasets import load_dataset
9 | from transformers import AutoTokenizer
10 | import ray.data
11 | import torch
12 | import numpy as np
13 |
14 | from datasets import load_metric
15 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
16 |
17 | import ray.train
18 | from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback
19 | from ray.train.torch import TorchTrainer
20 | from ray.train import RunConfig, ScalingConfig, CheckpointConfig, FailureConfig
21 | from llmadmin.backend.logger import get_logger
22 |
23 | logger = get_logger(__name__)
24 |
25 | # GLUE_TASKS = [
26 | # "cola",
27 | # "mnli",
28 | # "mnli-mm",
29 | # "mrpc",
30 | # "qnli",
31 | # "qqp",
32 | # "rte",
33 | # "sst2",
34 | # "stsb",
35 | # "wnli",
36 | # ]
37 |
38 | class RayTrain(BaseFT):
39 |
40 | def __init__(self, ftApp: FTApp):
41 | self.init_model_dataset()
42 | super().__init__(ftapp=ftApp)
43 |
44 | def init_model_dataset(self):
45 | self.use_gpu = False # set this to False to run on CPUs
46 | self.num_workers = 2 # set this to number of GPUs or CPUs you want to use
47 | logger.info(f"Is CUDA available: {torch.cuda.is_available()}")
48 | logger.info(f"init model and dataset with num_workers={self.num_workers}, use_gpu={self.use_gpu}")
49 | self.task_to_keys = {
50 | "cola": ("sentence", None),
51 | "mnli": ("premise", "hypothesis"),
52 | "mnli-mm": ("premise", "hypothesis"),
53 | "mrpc": ("sentence1", "sentence2"),
54 | "qnli": ("question", "sentence"),
55 | "qqp": ("question1", "question2"),
56 | "rte": ("sentence1", "sentence2"),
57 | "sst2": ("sentence", None),
58 | "stsb": ("sentence1", "sentence2"),
59 | "wnli": ("sentence1", "sentence2"),
60 | }
61 | self.task = "cola"
62 | self.actual_task = "mnli" if self.task == "mnli-mm" else self.task
63 | self.model_checkpoint = "/Users/hhwang/models/distilbert-base-uncased"
64 |
65 | logger.info(f"begin load model {self.model_checkpoint}")
66 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True)
67 | self.num_labels = 3 if self.task.startswith("mnli") else 1 if self.task == "stsb" else 2
68 | self.batch_size = 2
69 |
70 | dataset_path = "glue"
71 | logger.info(f"begin load dataset {dataset_path} -> {self.actual_task}")
72 | datasets = load_dataset(dataset_path, self.actual_task)
73 | logger.info(f"loaded datasets: {datasets}")
74 | item_count = 20
75 | logger.info(f"convert {item_count} records to ray dataset")
76 | self.ray_datasets = {
77 | "train": ray.data.from_huggingface(datasets["train"].select(range(item_count))),
78 | "validation": ray.data.from_huggingface(datasets["validation"].select(range(item_count))),
79 | "test": ray.data.from_huggingface(datasets["test"].select(range(item_count))),
80 | }
81 | self.train_count = self.ray_datasets["train"].count()
82 | self.validation_count = self.ray_datasets["validation"].count()
83 | self.test_count = self.ray_datasets["test"].count()
84 | logger.info(f"dataset train count: {self.train_count}")
85 | logger.info(f"dataset validation count: {self.validation_count}")
86 | logger.info(f"dataset test count: {self.test_count}")
87 | model_name = self.model_checkpoint.split("/")[-1]
88 | self.name = f"{model_name}-finetuned-{self.task}"
89 | logger.info(f"output model dir: {self.name}")
90 |
91 | # Tokenize input sentences
92 | def collate_fn(self, examples: Dict[str, np.array]):
93 | sentence1_key, sentence2_key = self.task_to_keys[self.task]
94 | if sentence2_key is None:
95 | outputs = self.tokenizer(
96 | list(examples[sentence1_key]),
97 | truncation=True,
98 | padding="longest",
99 | return_tensors="pt",
100 | )
101 | else:
102 | outputs = self.tokenizer(
103 | list(examples[sentence1_key]),
104 | list(examples[sentence2_key]),
105 | truncation=True,
106 | padding="longest",
107 | return_tensors="pt",
108 | )
109 | outputs["labels"] = torch.LongTensor(examples["label"])
110 |
111 | if self.use_gpu:
112 | # Move all input tensors to GPU
113 | for key, value in outputs.items():
114 | outputs[key] = value.cuda()
115 |
116 | return outputs
117 |
118 | def train_func(self, config):
119 | # Calculate the maximum steps per epoch based on the number of rows in the training dataset.
120 | # Make sure to scale by the total number of training workers and the per device batch size.
121 | max_steps_per_epoch = self.ray_datasets["train"].count() // (self.batch_size * self.num_workers)
122 | logger.info(f"max_steps_per_epoch: {max_steps_per_epoch}, batch_size: {self.batch_size}, num_workers: {self.num_workers}")
123 |
124 | # metric = load_metric("glue", self.actual_task)
125 | tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True)
126 | model = AutoModelForSequenceClassification.from_pretrained(
127 | self.model_checkpoint, num_labels=self.num_labels
128 | )
129 |
130 | train_ds = ray.train.get_dataset_shard("train")
131 | eval_ds = ray.train.get_dataset_shard("eval")
132 |
133 | train_ds_iterable = train_ds.iter_torch_batches(
134 | batch_size=self.batch_size, collate_fn=self.collate_fn
135 | )
136 | eval_ds_iterable = eval_ds.iter_torch_batches(
137 | batch_size=self.batch_size, collate_fn=self.collate_fn
138 | )
139 |
140 | args = TrainingArguments(
141 | self.name,
142 | evaluation_strategy="epoch",
143 | save_strategy="epoch",
144 | logging_strategy="epoch",
145 | per_device_train_batch_size=self.batch_size,
146 | per_device_eval_batch_size=self.batch_size,
147 | learning_rate=config.get("learning_rate", 2e-5),
148 | num_train_epochs=config.get("epochs", 2),
149 | weight_decay=config.get("weight_decay", 0.01),
150 | push_to_hub=False,
151 | max_steps=max_steps_per_epoch * config.get("epochs", 2),
152 | disable_tqdm=True, # declutter the output a little
153 | use_cpu=not self.use_gpu, # you need to explicitly set no_cuda if you want CPUs
154 | report_to="none",
155 | )
156 |
157 | # def compute_metrics(eval_pred):
158 | # predictions, labels = eval_pred
159 | # if self.task != "stsb":
160 | # predictions = np.argmax(predictions, axis=1)
161 | # else:
162 | # predictions = predictions[:, 0]
163 | # return metric.compute(predictions=predictions, references=labels)
164 |
165 | trainer = Trainer(
166 | model,
167 | args,
168 | train_dataset=train_ds_iterable,
169 | eval_dataset=eval_ds_iterable,
170 | tokenizer=tokenizer,
171 | # compute_metrics=compute_metrics,
172 | )
173 |
174 | trainer.add_callback(RayTrainReportCallback())
175 |
176 | trainer = prepare_trainer(trainer)
177 |
178 | logger.info("Starting training")
179 | trainer.train()
180 |
181 | def train(self):
182 | # metric_name = (
183 | # "pearson"
184 | # if self.task == "stsb"
185 | # else "matthews_correlation"
186 | # if self.task == "cola"
187 | # else "accuracy"
188 | # )
189 |
190 | # validation_key = (
191 | # "validation_mismatched"
192 | # if self.task == "mnli-mm"
193 | # else "validation_matched"
194 | # if self.task == "mnli"
195 | # else "validation"
196 | # )
197 | logger.info(f"build ray TorchTrainer")
198 |
199 | trainer = TorchTrainer(
200 | self.train_func,
201 | scaling_config=ScalingConfig(num_workers=self.num_workers, use_gpu=self.use_gpu),
202 | datasets={
203 | "train": self.ray_datasets["train"],
204 | "eval": self.ray_datasets["validation"],
205 | },
206 | run_config=RunConfig(
207 | checkpoint_config=CheckpointConfig(
208 | num_to_keep=1,
209 | checkpoint_score_attribute="eval_loss",
210 | checkpoint_score_order="min",
211 | ),
212 | failure_config=FailureConfig(
213 | max_failures=5
214 | )
215 | ),
216 | )
217 |
218 | logger.info(f"begin ray train fit")
219 | result = trainer.fit()
220 | logger.info(f"end ray train fit")
221 | logger.info(f"result: {result}")
222 |
223 |
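224 | 
225 | # Usage sketch (illustrative; requires a running Ray cluster, the GLUE dataset, and a local
226 | # copy of distilbert-base-uncased at the hard-coded `model_checkpoint` path above):
227 | #
228 | #   ft_app: FTApp = ...   # built from one of the ./models/ft--*.yaml files
229 | #   RayTrain(ft_app).train()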
--------------------------------------------------------------------------------
/llmadmin/backend/llm/initializers/hf_transformers/deepspeed.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from pathlib import Path
4 | from typing import Any, Dict, Optional, Tuple
5 |
6 | import deepspeed
7 | import torch
8 | from huggingface_hub import snapshot_download
9 | from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel
10 |
11 | from llmadmin.backend.logger import get_logger
12 |
13 | from .base import TransformersInitializer
14 |
15 | logger = get_logger(__name__)
16 |
17 |
18 | # TODO: Allow deepspeed kwargs
19 | class DeepSpeedInitializer(TransformersInitializer):
20 | """Initialize model (with DeepSpeed) and tokenizer and place them on the correct device.
21 |
22 | Args:
23 | device (torch.device): Device to place model and tokenizer on.
24 | world_size (int): Number of GPUs to use.
25 | dtype (torch.dtype, optional): Data type to use. Defaults to torch.float16.
26 | use_bettertransformer (bool, optional): Whether to use BetterTransformer. Defaults to False.
27 | torch_compile (Optional[Dict[str, Any]], optional): Parameters for ``torch.compile``. Defaults to None.
28 | max_tokens (int, optional): Maximum number of tokens to use. Defaults to 1024.
29 | use_kernel (bool, optional): Whether to use the DeepSpeed kernel injection. Defaults to False.
30 | use_meta_tensor (bool, optional): Whether to use meta tensor loading method. Defaults to False.
31 | injection_policy ([type], optional): Injection policy for DeepSpeed AutoTP. Cannot
32 | be set if use_kernel=True. Defaults to None.
33 | ds_inference_kwargs (Dict[str, Any], optional): Other keyword arguments for ``deepspeed.initialize``.
34 | Specific arguments in the signature of this function will override these values.
35 | **from_pretrained_kwargs: Keyword arguments for ``AutoModel.from_pretrained``.
36 | """
37 |
38 | def __init__(
39 | self,
40 | device: torch.device,
41 | world_size: int,
42 | dtype: torch.dtype = torch.float16,
43 | use_bettertransformer: bool = False,
44 | torch_compile: Optional[Dict[str, Any]] = None,
45 | max_tokens: int = 1024,
46 | use_kernel: bool = False,
47 | use_meta_tensor: bool = False,
48 | injection_policy=None,
49 | ds_inference_kwargs: Optional[Dict[str, Any]] = None,
50 | **from_pretrained_kwargs,
51 | ):
52 | super().__init__(
53 | device=device,
54 | world_size=world_size,
55 | dtype=dtype,
56 | use_bettertransformer=use_bettertransformer,
57 | torch_compile=torch_compile,
58 | **from_pretrained_kwargs,
59 | )
60 | self.max_tokens = max_tokens
61 | self.use_kernel = use_kernel
62 | self.use_meta_tensor = use_meta_tensor
63 | # TODO: Allow conversion from strings (need to do dynamic imports)
64 | self.injection_policy = injection_policy
65 | self.ds_inference_kwargs = ds_inference_kwargs
66 |
67 | if self.use_kernel:
68 | assert not (self.use_bettertransformer or self.torch_compile)
69 |
70 | if self.use_meta_tensor:
71 | assert self.use_kernel
72 |
73 | def _get_model_from_pretrained_kwargs(self):
74 | return dict(
75 | low_cpu_mem_usage=True,
76 | torch_dtype=self.dtype,
77 | **self.from_pretrained_kwargs,
78 | )
79 |
80 | # From https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/utils.py
81 | def _generate_checkpoint_json(
82 | self, model_id: str, checkpoint_path: Optional[str] = None
83 | ) -> Tuple[str, str]:
84 | if checkpoint_path is None:
85 | repo_root = snapshot_download(
86 | model_id,
87 | allow_patterns=["*"],
88 | ignore_patterns=["*.safetensors", "*.h5", "*.msgpack"],
89 | local_files_only=False,
90 | revision=None,
91 | )
92 | else:
93 | assert os.path.exists(
94 | checkpoint_path
95 | ), f"Checkpoint path {checkpoint_path} does not exist"
96 | repo_root = checkpoint_path
97 |
98 | if os.path.exists(os.path.join(repo_root, "ds_inference_config.json")):
99 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json")
100 | elif model_id in [
101 | "microsoft/bloom-deepspeed-inference-int8",
102 | "microsoft/bloom-deepspeed-inference-fp16",
103 | ]:
104 | # tp presharded repos come with their own checkpoints config file
105 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json")
106 | else:
107 | checkpoints_json = os.path.join(repo_root, "checkpoints.json")
108 |
109 | with open(checkpoints_json, "w", encoding="utf-8") as f:
110 | file_list = [
111 | str(entry).split("/")[-1]
112 | for entry in Path(repo_root).rglob("*.[bp][it][n]")
113 | if entry.is_file()
114 | ]
115 | data = {"type": "BLOOM", "checkpoints": file_list, "version": 1.0}
116 | json.dump(data, f)
117 |
118 | return os.path.abspath(repo_root), os.path.abspath(checkpoints_json)
119 |
120 | def load_model(self, model_id: str) -> "PreTrainedModel":
121 | model_id_or_path = self._get_model_location_on_disk(model_id)
122 | from_pretrained_kwargs = self._get_model_from_pretrained_kwargs()
123 |
124 | logger.info(f"Loading model {model_id_or_path}...")
125 | if self.use_meta_tensor:
126 | logger.info("Loading model using DeepSpeed meta tensor...")
127 |
128 | try:
129 | config = AutoConfig.from_pretrained(
130 | model_id_or_path, **from_pretrained_kwargs
131 | )
132 | except OSError:
133 | if model_id_or_path != model_id:
134 | logger.warning(
135 | f"Couldn't load model from derived path {model_id_or_path}, "
136 | f"trying to load from model_id {model_id}"
137 | )
138 | config = AutoConfig.from_pretrained(
139 | model_id, **from_pretrained_kwargs
140 | )
141 | else:
142 | raise
143 |
144 | self._repo_root, self._checkpoints_json = self._generate_checkpoint_json(
145 | model_id
146 | )
147 |
148 | with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
149 | model = AutoModelForCausalLM.from_config(config)
150 | else:
151 | try:
152 | model = AutoModelForCausalLM.from_pretrained(
153 | model_id_or_path, **from_pretrained_kwargs
154 | )
155 | except OSError:
156 | if model_id_or_path != model_id:
157 | logger.warning(
158 | f"Couldn't load model from derived path {model_id_or_path}, "
159 | f"trying to load from model_id {model_id}"
160 | )
161 | model = AutoModelForCausalLM.from_pretrained(
162 | model_id, **from_pretrained_kwargs
163 | )
164 | else:
165 | raise
166 | model.eval()
167 | return model
168 |
169 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel":
170 | from transformers import GPTNeoXForCausalLM, LlamaForCausalLM
171 |
172 | injection_policy = self.injection_policy
173 | # TODO: remove those later when deepspeed master is updated
174 | if injection_policy is None and not self.use_kernel:
175 | if isinstance(model, GPTNeoXForCausalLM):
176 | from transformers import GPTNeoXLayer
177 |
178 | injection_policy = {
179 | GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")
180 | }
181 | elif isinstance(model, LlamaForCausalLM):
182 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer
183 |
184 | injection_policy = {
185 | LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")
186 | }
187 |
188 | if self.use_bettertransformer:
189 | from optimum.bettertransformer import BetterTransformer
190 |
191 | logger.info("Transforming the model with BetterTransformer...")
192 | model = BetterTransformer.transform(model)
193 |
194 | ds_kwargs = self.ds_inference_kwargs or {}
195 | ds_kwargs = ds_kwargs.copy()
196 | ds_kwargs.update(
197 | dict(
198 | dtype=self.dtype,
199 | mp_size=self.world_size,
200 | replace_with_kernel_inject=self.use_kernel,
201 | injection_policy=injection_policy,
202 | max_tokens=self.max_tokens,
203 | )
204 | )
205 | if self.use_meta_tensor:
206 | ds_kwargs.update(
207 | dict(base_dir=self._repo_root, checkpoint=self._checkpoints_json)
208 | )
209 |
210 | logger.info(f"deepspeed.init_inference kwargs: {ds_kwargs}")
211 | model = deepspeed.init_inference(
212 | model,
213 | **ds_kwargs,
214 | )
215 |
216 | if self.torch_compile and self.torch_compile["backend"]:
217 | logger.info("Compiling the model with torch.compile()...")
218 | model = torch.compile(model, **self.torch_compile)
219 |
220 | # Add attributes for compatibility with the pipeline
221 | model.use_kernel = self.use_kernel
222 | model.device = self.device
223 | model = model.to(self.device)
224 | return model
225 |
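# Illustrative only: with hypothetical values use_kernel=True, world_size=2,
# dtype=torch.float16 and max_tokens=1024, the call assembled above would be
# roughly equivalent to:
#
#   deepspeed.init_inference(
#       model,
#       dtype=torch.float16,
#       mp_size=2,
#       replace_with_kernel_inject=True,
#       injection_policy=None,  # kernel injection replaces the manual policy
#       max_tokens=1024,
#   )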
--------------------------------------------------------------------------------
/llmadmin/backend/llm/utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import subprocess
4 | import time
5 | import traceback
6 | from collections import defaultdict
7 | from functools import wraps
8 | from typing import List, Optional
9 |
10 | from transformers import AutoConfig
11 | import torch.distributed as dist
12 | from filelock import FileLock
13 | from ray.air.util.torch_dist import (
14 | ActorHandle,
15 | _get_node_and_gpu_ids,
16 | _init_torch_distributed,
17 | get_address_and_port,
18 | )
19 | from torch.hub import _get_torch_home
20 |
21 | from llmadmin.backend.logger import get_logger
22 | from llmadmin.backend.server.models import S3MirrorConfig
23 |
24 | logger = get_logger(__name__)
25 |
26 |
27 | def download_model(
28 | model_id: str,
29 | endpoint_url: str,
30 | bucket_uri: str,
31 | s3_sync_args: Optional[List[str]] = None,
32 | ) -> None:
33 | """
34 | Download a model from an S3 bucket and save it in TRANSFORMERS_CACHE for
35 | seamless interoperability with Hugging Face's Transformers library.
36 |
37 | The downloaded model must have a 'hash' file containing the commit hash corresponding
38 | to the commit on Hugging Face Hub.
39 | """
40 | from transformers.utils.hub import TRANSFORMERS_CACHE
41 |
42 | isAutoLoadConfigSuccess = False
43 | modelConfig = None
44 | try:
45 | modelConfig = AutoConfig.from_pretrained(
46 | model_id, trust_remote_code=True)
47 | isAutoLoadConfigSuccess = True
48 | except Exception:
49 | isAutoLoadConfigSuccess = False
50 |
51 | if modelConfig and isAutoLoadConfigSuccess:
52 | logger.info(f"Model exist and success to load AutoConfig from_pretrained '{model_id}'")
53 | return
54 | else:
55 | logger.info(f"Fail to load AutoConfig from_pretrained '{model_id}'")
56 |
57 | s3_sync_args = s3_sync_args or []
58 | logger.info(f"Downloading '{model_id}' from '{bucket_uri}' to '{TRANSFORMERS_CACHE}'")
59 | path = os.path.expanduser(os.path.join(TRANSFORMERS_CACHE, f"models--{model_id.replace('/', '--')}"))
60 |
61 | isS3 = bucket_uri.startswith('s3://')
62 | if isS3:
63 | model_hash_file = os.path.join(bucket_uri, "hash")
64 | if endpoint_url:
65 | logger.info(f"Downloading '{model_id}' hash from server '{endpoint_url}' '{model_hash_file}' ")
66 | subprocess.run(["aws", "--endpoint-url", endpoint_url, "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."])
67 | else:
68 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ")
69 | subprocess.run(["aws", "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."])
70 | else:
71 | model_hash_file = bucket_uri + "hash"
72 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ")
73 | subprocess.run(["cp -rf " + model_hash_file + " ."], shell=True)
74 |
75 | if not os.path.exists(os.path.join(".", "hash")):
76 | raise RuntimeError("Hash file not found in the bucket or bucket could not have been downloaded.")
77 |
78 | with open(os.path.join(".", "hash"), "r") as f:
79 | f_hash = f.read().strip()
80 |
81 | model_cache_path = os.path.join(path, "snapshots", f_hash)
82 |
83 | model_config_file = os.path.join(model_cache_path, "config.json")
84 | if os.path.exists(model_config_file):
85 | logger.info(f"Skip download model '{model_id}' due to config '{model_config_file}' exist")
86 | return
87 |
88 | subprocess.run(["mkdir", "-p", model_cache_path])
89 | subprocess.run(["mkdir", "-p", os.path.join(path, "refs")])
90 |
91 | logger.info(f"Downloading '{model_id}' files from '{bucket_uri}' to '{model_cache_path}'")
92 | if isS3:
93 | if endpoint_url:
94 | subprocess.run([ "aws", "--endpoint-url", endpoint_url, "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path])
95 | else:
96 | subprocess.run([ "aws", "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path])
97 | else:
98 | subprocess.run(["cp -rf " + bucket_uri + "*" + " " + model_cache_path], shell=True)
99 |
100 | with open(os.path.join(path, "refs", "main"), "w") as f:
101 | f.write(f_hash)
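# A minimal usage sketch (the model id, endpoint and bucket URI are hypothetical;
# real values come from the model's S3MirrorConfig):
#
#   download_model(
#       "facebook/opt-125m",
#       endpoint_url="http://minio.local:9000",
#       bucket_uri="s3://my-models/facebook/opt-125m/",
#       s3_sync_args=["--no-sign-request"],
#   )
#
# The bucket must contain a `hash` file with the Hugging Face commit hash; the
# model files are synced into
# TRANSFORMERS_CACHE/models--facebook--opt-125m/snapshots/<hash>/ and a
# refs/main entry is written so that transformers can resolve the model offline.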
102 |
103 | def timeit(func):
104 | """
105 | Decorator to time a function.
106 | """
107 |
108 | @wraps(func)
109 | def inner(*args, **kwargs):
110 | start_time = time.monotonic()
111 | ret = func(*args, **kwargs)
112 | time_taken = time.monotonic() - start_time
113 | logger.info(f"LLM time counting fun {func} took {time_taken} s to complete")
114 | return ret
115 |
116 | return inner
117 |
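# Example: wrap any callable to have its wall-clock duration logged via the
# module logger (the decorated function below is hypothetical):
#
#   @timeit
#   def generate(prompt: str) -> str:
#       ...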
118 |
119 | def initialize_node(
120 | model_id: Optional[str] = None,
121 | s3_mirror_config: Optional[S3MirrorConfig] = None,
122 | ):
123 | """
124 | Perform initialization for a node.
125 |
126 | Currently, that means downloading the model from the S3 bucket.
127 | """
128 | # Create the torch cache kernels directory if it doesn't exist.
129 | # This is a workaround for a torch issue, where the kernels directory
130 | # cannot be created by torch if the parent directory doesn't exist.
131 | torch_cache_home = _get_torch_home()
132 | os.makedirs(os.path.join(torch_cache_home, "kernels"), exist_ok=True)
133 |
134 | if model_id and s3_mirror_config and s3_mirror_config.bucket_uri:
135 | lock_path = os.path.expanduser(f"~/{model_id.replace('/', '--')}.lock")
136 | try:
137 | # Timeout 0 means there will be only one attempt to acquire
138 | # the file lock. If it cannot be acquired, a TimeoutError
139 | # will be thrown.
140 | # This allows us to make sure that subsequent processes don't
141 | # duplicate work.
142 | with FileLock(lock_path, timeout=0):
143 | endpoint_url = s3_mirror_config.endpoint_url
144 | bucket_uri = s3_mirror_config.bucket_uri
145 | s3_sync_args = s3_mirror_config.s3_sync_args
146 | try:
147 | download_model(model_id, endpoint_url, bucket_uri, s3_sync_args=s3_sync_args)
148 | logger.info("Done downloading the model from bucket!")
149 | except RuntimeError:
150 | logger.warning(
151 | f"Unable to download the model from bucket. Traceback:\n {traceback.format_exc()}"
152 | )
153 | except TimeoutError:
154 | # if the directory is already locked, then wait but do not do anything.
155 | with FileLock(lock_path, timeout=-1):
156 | pass
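# Sketch of a worker calling this at startup (the model id and mirror config are
# illustrative; the S3MirrorConfig fields are assumed to match the attributes
# accessed above):
#
#   initialize_node(
#       model_id="facebook/opt-125m",
#       s3_mirror_config=S3MirrorConfig(bucket_uri="s3://my-models/facebook/opt-125m/"),
#   )
#
# The per-model file lock ensures only the first process on a node downloads the
# weights; later processes block on the lock and then return without re-downloading.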
157 |
158 |
159 | def merge_dicts(overwrite: dict, base: dict) -> dict:
160 | """
161 | Merge two dictionaries recursively, with keys from overwrite taking precedence.
162 | """
163 | base = base.copy()
164 | for key, value in overwrite.items():
165 | if isinstance(value, dict):
166 | # get node or create one
167 | node = base.setdefault(key, {})
168 | base[key] = merge_dicts(value, node)
169 | else:
170 | base[key] = value
171 |
172 | return base
173 |
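# Example: nested keys are merged recursively, with `overwrite` taking
# precedence on conflicts:
#
#   merge_dicts({"a": {"x": 1}, "b": 0}, {"a": {"y": 2}, "b": 3})
#   # -> {"a": {"y": 2, "x": 1}, "b": 0}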
174 |
175 | async def init_torch_dist_process_group_async(
176 | workers: List[ActorHandle],
177 | backend: str = "gloo",
178 | init_method: str = "env",
179 | ) -> List[int]:
180 | """Initialize a torch distributed process group asynchronously.
181 |
182 | This is identical to
183 | ``ray.air.util.torch_dist.init_torch_dist_process_group``
184 | but uses asyncio to avoid blocking the event loop.
185 |
186 | Note: this util assumes that the order of the workers passed in
187 | are their global ranks.
188 |
189 | Args:
190 | workers: A list of TorchDistributedWorker actors.
191 | backend: The torch distributed backend to use,
192 | possible choices are "gloo" or "nccl".
193 | init_method: The initialization method to use,
194 | possible choices are "env" or "tcp".
195 |
196 | Returns:
197 | Local ranks on their respective nodes for the list of workers.
198 | """
199 | if not dist.is_available():
200 | raise RuntimeError("Distributed torch is not available.")
201 |
202 | # Build a map from node_id to workers on that node.
203 | node_and_gpu_ids = await asyncio.gather(
204 | *[w.execute.remote(_get_node_and_gpu_ids) for w in workers]
205 | )
206 | # All the workers on a specific node.
207 | node_to_workers = defaultdict(list)
208 | # All the gpu ids visible to all the workers on a specific node.
209 | node_to_gpu_ids = defaultdict(set)
210 | for i, (node_id, gpu_ids) in enumerate(node_and_gpu_ids):
211 | node_to_workers[node_id].append(i)
212 | # Force list.
213 | if not isinstance(gpu_ids, list):
214 | gpu_ids = [gpu_ids]
215 | # It is possible for a worker to have access to multiple GPUs.
216 | for gpu_id in gpu_ids:
217 | node_to_gpu_ids[node_id].add(gpu_id)
218 |
219 | # Assume the first worker is the master.
220 | master_addr, master_port = (
221 | await asyncio.gather(workers[0].execute.remote(get_address_and_port))
222 | )[0]
223 |
224 | setup_futures = []
225 | world_size = len(workers)
226 | local_ranks = []
227 | for rank, worker in enumerate(workers):
228 | node_id = node_and_gpu_ids[rank][0]
229 | local_rank = node_to_workers[node_id].index(rank)
230 | local_world_size = len(node_to_workers[node_id])
231 | setup_futures.append(
232 | worker.execute.remote(
233 | _init_torch_distributed,
234 | init_method=init_method,
235 | backend=backend,
236 | rank=rank,
237 | world_size=world_size,
238 | local_rank=local_rank,
239 | local_world_size=local_world_size,
240 | master_addr=master_addr,
241 | master_port=master_port,
242 | # Sort the gpu ids so that CUDA_VISIBLE_DEVICES
243 | # is always sorted.
244 | gpu_ids=sorted(node_to_gpu_ids[node_id]),
245 | )
246 | )
247 | local_ranks.append(local_rank)
248 |
249 | # Wait for all workers to join the process group.
250 | await asyncio.gather(*setup_futures)
251 |
252 | return local_ranks
253 |
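# Rough usage sketch (assumes `workers` is a list of Ray actor handles that
# implement ray.air.util.torch_dist.TorchDistributedWorker, ordered by global
# rank):
#
#   local_ranks = await init_torch_dist_process_group_async(
#       workers, backend="nccl", init_method="env"
#   )
#
# Afterwards every worker has joined the same torch.distributed process group,
# and local_ranks[i] is worker i's rank on its own node.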
--------------------------------------------------------------------------------
/llmadmin/api/cli.py:
--------------------------------------------------------------------------------
1 | # import ast
2 | # import json
3 | from typing import Annotated, Optional
4 |
5 | import typer
6 | # from rich import print as rp
7 | # from rich.console import Console
8 | # from rich.progress import Progress, SpinnerColumn, TextColumn
9 | # from rich.table import Table
10 | from ray.serve._private.constants import DEFAULT_HTTP_PORT
11 | from llmadmin.api import sdk
12 |
13 | app = typer.Typer()
14 |
15 | model_type = typer.Option(
16 | default=..., help="The model to use. You can specify multiple models."
17 | )
18 |
19 | ft_define = typer.Option(
20 | default=..., help="the fine tune yaml file"
21 | )
22 |
23 | app_name = typer.Option(
24 | default=..., help="The name of ray serve application."
25 | )
26 | host = typer.Option(
27 | default=..., help="The host ip address of ray api server."
28 | )
29 | port = typer.Option(
30 | default=...,help="The port of api server."
31 | )
32 | prompt_type = typer.Option(help="Prompt to query")
33 | stats_type = typer.Option(help="Whether to print generated statistics")
34 | prompt_file_type = typer.Option(
35 | default=..., help="File containing prompts. A simple text file"
36 | )
37 | separator_type = typer.Option(help="Separator used in prompt files")
38 | results_type = typer.Option(help="Where to save the results")
39 | file_type = typer.Option(default=..., help="The flow graph")
40 | port_type = typer.Option(default=..., help="The port of service.")
41 | apiserver_scale_type = typer.Option(default=..., help="A comma-separated key=value string for scaling the service, for example: --scale-config=min_replicas=1,max_replicas=5")
42 | apiserver_resource_type = typer.Option(default=..., help="A comma-separated key=value string for resource requirements, for example: --resource-config=num_cpus=1")
43 |
44 | @app.command()
45 | def start_apiserver(
46 | port: Annotated[Optional[int], port_type] = DEFAULT_HTTP_PORT,
47 | resource_config: Annotated[str, apiserver_resource_type] = None,
48 | scale_config: Annotated[str, apiserver_scale_type] = None
49 | ):
50 | """Start a api server, it will provide apis.
51 | Args:
52 | *host: The host ip to run.
53 | *port: The port to run.
54 | """
55 | sdk.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config)
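# Hedged CLI example (the `llmadmin` entry-point name is an assumption based
# on the package name; use whatever console script the project registers):
#
#   llmadmin start-apiserver --port 8000 \
#       --resource-config=num_cpus=1 \
#       --scale-config=min_replicas=1,max_replicas=5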
56 |
57 | @app.command()
58 | def run_ft(ft: Annotated[str, ft_define]):
59 | """Start a fine tune process.
60 |
61 | Args:
62 | *model: The model to run.
63 | """
64 | sdk.run_ft(ft)
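# Hedged CLI example (the YAML path is illustrative):
#
#   llmadmin run-ft --ft ./my-finetune.yaml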
65 |
66 | @app.command()
67 | def ray_ft(model: Annotated[str, ft_define]):
68 | """Start a fine tune ray process.
69 |
70 | Args:
71 | *model: The model to run.
72 | """
73 | sdk.run_ray_ft(model)
74 |
75 | # @app.command()
76 | # def list_models(metadata: Annotated[bool, "Whether to print metadata"] = False):
77 | # """Get a list of the available models"""
78 | # result = sdk.models()
79 | # if metadata:
80 | # for model in result:
81 | # rp(f"[bold]{model}:[/]")
82 | # rp(sdk.metadata(model))
83 | # else:
84 | # print("\n".join(result))
85 |
86 |
87 | # def _print_result(result, model, print_stats):
88 | # rp(f"[bold]{model}:[/]")
89 | # if print_stats:
90 | # rp("[bold]Stats:[/]")
91 | # rp(result)
92 | # else:
93 | # rp(result)
94 |
95 |
96 | # def progress_spinner():
97 | # return Progress(
98 | # SpinnerColumn(),
99 | # TextColumn("[progress.description]{task.description}"),
100 | # transient=True,
101 | # )
102 |
103 |
104 | # @app.command()
105 | # def query(
106 | # model: Annotated[List[str], model_type],
107 | # prompt: Annotated[Optional[List[str]], prompt_type] = None,
108 | # prompt_file: Annotated[Optional[str], prompt_file_type] = None,
109 | # separator: Annotated[str, separator_type] = "----",
110 | # output_file: Annotated[str, results_type] = "llmadmin-output.json",
111 | # print_stats: Annotated[bool, stats_type] = False,
112 | # ):
113 | # """Query one or several models with one or multiple prompts,
114 | # optionally read from file, and save the results to a file."""
115 | # with progress_spinner() as progress:
116 | # if prompt_file:
117 | # with open(prompt_file, "r") as f:
118 | # prompt = f.read().split(separator)
119 |
120 | # results = {p: [] for p in prompt}
121 |
122 | # for m in model:
123 | # progress.add_task(
124 | # description=f"Processing all prompts against model: {m}.",
125 | # total=None,
126 | # )
127 | # query_results = sdk.batch_query(m, prompt)
128 | # for result in query_results:
129 | # _print_result(result, m, print_stats)
130 |
131 | # for i, p in enumerate(prompt):
132 | # result = query_results[i]
133 | # text = result
134 | # # del result["generated_text"]
135 | # results[p].append({"model": m, "result": text, "stats": result})
136 |
137 | # progress.add_task(description="Writing output file.", total=None)
138 | # with open(output_file, "w") as f:
139 | # f.write(json.dumps(results, indent=2))
140 |
141 |
142 | # @app.command(deprecated=True, name="batch_query")
143 | # def batch_query(
144 | # model: Annotated[List[str], model_type],
145 | # prompt: Annotated[List[str], prompt_type],
146 | # print_stats: Annotated[bool, stats_type] = False,
147 | # ):
148 | # """Query a model with a batch of prompts."""
149 | # with progress_spinner() as progress:
150 | # for m in model:
151 | # progress.add_task(
152 | # description=f"Processing prompt against {m}...", total=None
153 | # )
154 | # results = sdk.batch_query(m, prompt)
155 | # for result in results:
156 | # _print_result(result, m, print_stats)
157 |
158 |
159 | # @app.command(deprecated=True, name="multi_query")
160 | # def multi_query(
161 | # model: Annotated[List[str], model_type],
162 | # prompt_file: Annotated[str, prompt_file_type],
163 | # separator: Annotated[str, separator_type] = "----",
164 | # output_file: Annotated[str, results_type] = "llmadmin-output.json",
165 | # ):
166 | # """Query one or multiple models with a batch of prompts taken from a file."""
167 |
168 | # with progress_spinner() as progress:
169 | # progress.add_task(
170 | # description=f"Loading your prompts from {prompt_file}.", total=None
171 | # )
172 | # with open(prompt_file, "r") as f:
173 | # prompts = f.read().split(separator)
174 | # results = {prompt: [] for prompt in prompts}
175 |
176 | # for m in model:
177 | # progress.add_task(
178 | # description=f"Processing all prompts against model: {model}.",
179 | # total=None,
180 | # )
181 | # query_results = sdk.batch_query(m, prompts)
182 | # for i, prompt in enumerate(prompts):
183 | # result = query_results[i]
184 | # text = result["generated_text"]
185 | # del result["generated_text"]
186 | # results[prompt].append({"model": m, "result": text, "stats": result})
187 |
188 | # progress.add_task(description="Writing output file.", total=None)
189 | # with open(output_file, "w") as f:
190 | # f.write(json.dumps(results, indent=2))
191 |
192 |
193 | # evaluator_type = typer.Option(help="Which LLM to use for evaluation")
194 |
195 |
196 | # @app.command()
197 | # def run(model: Annotated[List[str], model_type]):
198 | # """Start a model.
199 |
200 | # Args:
201 | # *model: The model to run.
202 | # """
203 | # sdk.run(*model)
204 |
205 | # @app.command()
206 | # def run_experimental(model: Annotated[List[str], model_type]):
207 | # """Start a model for experimental, it will do inference by transformer pipeline.
208 |
209 | # Args:
210 | # *model: The model to run.
211 | # """
212 | # sdk.run_experimental(*model)
213 |
214 | # @app.command()
215 | # def del_serve(appname: Annotated[str, app_name]):
216 | # """Remove a ray serve.
217 |
218 | # Args:
219 | # *model: The model to run.
220 | # """
221 | # sdk.del_experimental(appname)
222 |
223 | # @app.command()
224 | # def run_application(file: Annotated[str, file_type]):
225 | # """Start a model in LLMAdmin for experimental.
226 |
227 | # Args:
228 | # *model: The model to run.
229 | # """
230 | # from pathlib import Path
231 | # # If input is a file path, load JSON from the file
232 | # if isinstance(file, (str, Path)):
233 | # with open(file, "r", encoding="utf-8") as f:
234 | # flow_graph = json.load(f)
235 | # else:
236 | # raise TypeError(
237 | # "Input must be a file path (str)"
238 | # )
239 | # sdk.run_application(flow_graph)
240 |
241 |
242 |
243 | # @app.command()
244 | # def run_comparation():
245 | # """Start frontend for model comparation.
246 |
247 | # Args:
248 | # *model: The model to run.
249 | # """
250 | # sdk.run_comparation()
251 |
252 | # @app.command()
253 | # def evaluate(
254 | # input_file: Annotated[str, results_type] = "llmadmin-output.json",
255 | # evaluation_file: Annotated[str, results_type] = "evaluation-output.json",
256 | # evaluator: Annotated[str, evaluator_type] = "gpt-4",
257 | # ):
258 | # """Evaluate and summarize the results of a multi_query run with a strong
259 | # 'evaluator' LLM like GPT-4.
260 | # The results of the ranking are stored to file and displayed in a table.
261 | # """
262 | # with progress_spinner() as progress:
263 | # progress.add_task(description="Loading the evaluator LLM.", total=None)
264 | # if evaluator == "gpt-4":
265 | # from llmadmin.common.evaluation import GPT
266 |
267 | # eval_model = GPT()
268 | # else:
269 | # raise NotImplementedError(f"No evaluator for {evaluator}")
270 |
271 | # with open(input_file, "r") as f:
272 | # results = json.load(f)
273 |
274 | # for prompt, result_list in results.items():
275 | # progress.add_task(
276 | # description=f"Evaluating results for prompt: {prompt}.", total=None
277 | # )
278 | # evaluation = eval_model.evaluate_results(prompt, result_list)
279 | # try:
280 | # # GPT-4 returns a string with a Python dictionary, hopefully!
281 | # evaluation = ast.literal_eval(evaluation)
282 | # except Exception:
283 | # print(f"Could not parse evaluation: {evaluation}")
284 |
285 | # for i, _res in enumerate(results[prompt]):
286 | # results[prompt][i]["rank"] = evaluation[i]["rank"]
287 |
288 | # progress.add_task(description="Storing evaluations.", total=None)
289 | # with open(evaluation_file, "w") as f:
290 | # f.write(json.dumps(results, indent=2))
291 |
292 | # for prompt in results.keys():
293 | # table = Table(title="Evaluation results (higher ranks are better)")
294 |
295 | # table.add_column("Model", justify="left", style="cyan", no_wrap=True)
296 | # table.add_column("Rank", style="magenta")
297 | # table.add_column("Response", justify="right", style="green")
298 |
299 | # for i, _res in enumerate(results[prompt]):
300 | # model = results[prompt][i]["model"]
301 | # response = results[prompt][i]["result"]
302 | # rank = results[prompt][i]["rank"]
303 | # table.add_row(model, str(rank), response)
304 |
305 | # console = Console()
306 | # console.print(table)
307 |
308 |
309 | if __name__ == "__main__":
310 | app()
311 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023 Anyscale
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/llmadmin/backend/llm/ft/transformer.py:
--------------------------------------------------------------------------------
1 | from ._base import BaseFT
2 | from abc import ABC, abstractmethod
3 | from llmadmin.backend.logger import get_logger
4 | # from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset
5 | # from typing import Union
6 | from llmadmin.backend.server.models import FTApp
7 | from datasets import load_dataset
8 | from datasets import load_metric
9 | import pandas as pd
10 | # from ray.data.preprocessors import BatchMapper
11 | # import ray
12 | import torch
13 | from transformers import TrainingArguments, Trainer
14 | # import numpy as np
15 | # from ray.train.huggingface import TransformersTrainer
16 | # from ray.air.config import RunConfig, CheckpointConfig
17 | from .utils import parse_task_name
18 | from .tasks import TASK_REGISTRY
19 | from .tasks._base import Task
20 | from .methods.base import get_train_model
21 | # from ray.train.huggingface import TransformersCheckpoint
22 | from .const import CHECKPOINT_PATH
23 | from .callback import CustomCallback
24 |
25 | from llmadmin.backend.llm.utils import initialize_node
26 |
27 | logger = get_logger(__name__)
28 |
29 | class TransformersFT(BaseFT):
30 | def __init__(self, ftApp: FTApp):
31 | super().__init__(ftapp=ftApp)
32 |
33 | def train(self):
34 | self.trainV2()
35 |
36 | # Transformer train only
37 | def trainV2(self):
38 | taskobj: Task = None
39 | task = parse_task_name(self.ftapp)
40 | logger.info(f"TransformersFT.trainV2 finetune task name: '{task}'")
41 | taskcls = TASK_REGISTRY.get(task)
42 |
43 | if not taskcls:
44 | logger.error(f"Couldn't load the defined task from the registry: '{task}'")
45 | raise ValueError(f"Unknown finetune task: '{task}'")
46 |
47 | logger.info("Start initializing finetune node tasks")
48 | initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config)
49 | logger.info(f"Start loading tokenizer for finetune {self.model_config.model_id}")
50 | # self.model_config.model_id = '/root/.cache/huggingface/hub/ZhipuAI/chatglm3-6b/'
51 | # self.model_config.model_id = '/data/hhwang/models/chatglm2-6b/'
52 | tokenizer = self.initializer.load_tokenizer(self.model_config.model_id)
53 | if self.model_config.add_special_tokens:
54 | add_special_tokens = self.model_config.add_special_tokens
55 | if add_special_tokens.get("pad_token"):
56 | tokenizer.pad_token = add_special_tokens.get("pad_token")
57 | if add_special_tokens.get("eos_token"):
58 | tokenizer.eos_token = add_special_tokens.get("eos_token")
59 | logger.info(f"Initialize {taskcls} and load dataset")
60 | # logger.info(f"Initialize {taskcls} and load dataset for model {self.model_config.model_id}")
61 | taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config)
62 | logger.info(f"Load model {self.model_config.model_id} by {taskobj.AUTO_MODEL_CLASS}")
63 | from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {}
64 | model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs)
65 | if self.model_config.quantization_bit is not None:
66 | print(f"Quantized to {self.model_config.quantization_bit} bit")
67 | model = model.quantize(self.model_config.quantization_bit)
68 |
69 | taskobj.set_model(model)
70 |
71 | # preprocess_function = taskobj.get_data_proprocess()
72 | # compute_metrics_function = taskobj.get_compute_metrics()
73 | data_collator = taskobj.get_data_collator()
74 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas")
75 |
76 | data_config = self.ftapp.ft_config.data_config
77 | use_gpu = torch.cuda.is_available()
78 | use_mps = torch.backends.mps.is_available()
79 | logger.info(f"use_gpu: {use_gpu}, use_cpu: {not use_gpu}, use_mps: {use_mps}")
80 |
81 | logger.info(f"Finetune get train and validation dataset")
82 | if data_config.num_row > 0:
83 | # only for test purpose
84 | train_dataset = taskobj.getSmallTrainDataSet(data_config.num_row)
85 | eval_dataset = taskobj.getSmallEvalDataSet(data_config.num_row)
86 | else:
87 | # For train
88 | train_dataset = taskobj.getTrainDataSet()
89 | eval_dataset = taskobj.getEvalDataSet()
90 |
91 | logger.info(f"Finetune train dataset {train_dataset}")
92 | logger.info(f"Finetune eval dataset {eval_dataset}")
93 |
94 | if hasattr(model, "is_parallelizable"):
95 | logger.info(f"model.is_parallelizable = {model.is_parallelizable}")
96 |
97 | if hasattr(model, "model_parallel"):
98 | logger.info(f"model.model_parallel = {model.model_parallel}")
99 |
100 | if getattr(model, "hf_device_map", None) is not None:
101 | logger.info(f"model.hf_device_map is {model.hf_device_map}")
102 |
103 | ftConfig = self.ftapp.ft_config.train_config.base_config
104 | model_name = self.model_config.model_id.split("/")[-1]
105 | task_name = self.ft_task
106 | outputDir = f"{ftConfig.checkpoints_output_dir}/{model_name}-finetuned-{task_name}-{data_config.data_path}-{data_config.subset}"
107 | logger.info(f"Finetune checkpoints output dir: {outputDir}")
108 | args = TrainingArguments(
109 | outputDir,
110 | evaluation_strategy=ftConfig.evaluation_strategy,
111 | save_strategy=ftConfig.save_strategy,
112 | logging_strategy=ftConfig.logging_strategy,
113 | logging_steps = 2,
114 | save_steps = ftConfig.save_steps,
115 | eval_steps = 2,
116 | learning_rate=ftConfig.learning_rate,
117 | per_device_train_batch_size=ftConfig.per_device_train_batch_size,
118 | per_device_eval_batch_size=ftConfig.per_device_eval_batch_size,
119 | num_train_epochs=ftConfig.num_train_epochs,
120 | weight_decay=ftConfig.weight_decay,
121 | push_to_hub=False,
122 | disable_tqdm=False, # declutter the output a little
123 | use_cpu=not use_gpu, # use_cpu must be set explicitly to train on CPU
124 | remove_unused_columns=ftConfig.remove_unused_columns,
125 | )
126 | trainConfig = self.ftapp.ft_config.train_config
127 | ftMethod = self.ftapp.ft_config.ft_method
128 | model = get_train_model(model, ftMethod, trainConfig)
129 | trainer = Trainer(
130 | # trainer = Seq2SeqTrainer(
131 | model,
132 | args,
133 | train_dataset=train_dataset,
134 | eval_dataset=eval_dataset,
135 | tokenizer=tokenizer,
136 | # compute_metrics=compute_metrics_function,
137 | data_collator=data_collator,
138 | )
139 | trainer.add_callback(CustomCallback(trainer))
140 | logger.info("Starting training")
141 | trainResult = trainer.train()
142 | logger.info(f"Train result {trainResult}")
143 | trainer.save_model()
144 | logger.info(f"Save model to {trainer.args.output_dir}")
145 | logger.info("Done training")
146 |
147 | # depend on ray for distribution
148 | # def trainV1(self):
149 | # taskobj: Task = None
150 | # task = parse_task_name(self.ftapp)
151 | # logger.info(f"TransformersFT.trainV1 finetune task name {task}")
152 | # taskcls = TASK_REGISTRY[task]
153 |
154 | # if not taskcls:
155 | # logger.error(f"Couldn't load defined task from register: {task}")
156 | # raise
157 |
158 | # logger.info("Starting initialize Finetune node tasks")
159 | # initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config)
160 |
161 | # tokenizer = self.initializer.load_tokenizer(self.model_config.model_id)
162 | # logger.info("Done load tokenizer for finetune")
163 |
164 | # taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config)
165 |
166 | # from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {}
167 | # model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs)
168 | # taskobj.set_model(model)
169 |
170 | # preprocess_function = taskobj.get_data_proprocess()
171 | # compute_metrics_function = taskobj.get_compute_metrics()
172 | # data_collator = taskobj.get_data_collator()
173 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas")
174 |
175 | # ray_datasets = ray.data.from_huggingface(taskobj.get_dataset())
176 | # model_name = self.model_config.model_id.split("/")[-1]
177 | # task = self.ft_task
178 | # name = f"{model_name}-finetuned-{task}"
179 | # use_gpu = True if torch.cuda.is_available() else False
180 |
181 | # def trainer_init_per_worker(train_dataset, eval_dataset = None, **config):
182 | # print(f"Is CUDA available: {torch.cuda.is_available()}")
183 |
184 | # args = TrainingArguments(
185 | # name,
186 | # evaluation_strategy=config.get("evaluation_strategy", "epoch"),
187 | # save_strategy=config.get("save_strategy", "epoch"),
188 | # logging_strategy=config.get("logging_strategy", "epoch"),
189 | # logging_steps = 2,
190 | # save_steps = 500,
191 | # eval_steps = 2,
192 | # learning_rate=config.get("learning_rate", 2e-5),
193 | # per_device_train_batch_size=config.get("per_device_train_batch_size", 16),
194 | # per_device_eval_batch_size=config.get("per_device_train_batch_size", 16),
195 | # num_train_epochs=config.get("epochs", 2),
196 | # weight_decay=config.get("weight_decay", 0.01),
197 | # push_to_hub=False,
198 | # disable_tqdm=False, # declutter the output a little
199 | # no_cuda=not use_gpu, # you need to explicitly set no_cuda if you want CPUs
200 | # remove_unused_columns=config.get("remove_unused_columns", True),
201 | # fp16=True,
202 | # )
203 |
204 | # trainer = Trainer(
205 | # model,
206 | # args,
207 | # train_dataset=train_dataset,
208 | # eval_dataset=eval_dataset,
209 | # tokenizer=tokenizer,
210 | # compute_metrics=compute_metrics_function,
211 | # data_collator=data_collator,
212 | # )
213 | # trainer.add_callback(CustomCallback(trainer))
214 | # print("Starting training")
215 |
216 | # return trainer
217 |
218 | # trainer = TransformersTrainer(
219 | # trainer_init_per_worker=trainer_init_per_worker,
220 | # trainer_init_config = self.train_conf.get_train_kwargs(),
221 | # scaling_config=self.scale_config.as_air_scaling_config(),
222 | # datasets={
223 | # "train": ray_datasets[taskobj.training_key()],
224 | # "evaluation": ray_datasets[taskobj.validation_key()],
225 | # },
226 | # run_config=RunConfig(
227 | # # callbacks=[MLflowLoggerCallback(experiment_name=name)],
228 | # checkpoint_config=CheckpointConfig(
229 | # num_to_keep=1,
230 | # checkpoint_score_attribute="eval_loss",
231 | # checkpoint_score_order="min",
232 | # ),
233 | # ),
234 | # preprocessor=batch_encoder,
235 | # )
236 |
237 | # result = trainer.fit()
238 | # print(result)
239 | # checkpoint = TransformersCheckpoint.from_checkpoint(result.checkpoint)
240 | # hf_trainer = checkpoint.get_model(model=taskobj.AUTO_MODEL_CLASS)
241 | # hf_trainer.save_pretrained(CHECKPOINT_PATH)
242 | # tokenizer.save_pretrained(CHECKPOINT_PATH)
243 |
244 | # print("Done")
245 |
246 |
247 |
248 |
249 |
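# Hedged usage sketch: TransformersFT is driven by an FTApp built from a
# fine-tune YAML (how the FTApp is constructed is outside this module):
#
#   ft_app: FTApp = ...  # e.g. parsed from a fine-tune YAML by the server config loader
#   TransformersFT(ft_app).train()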
--------------------------------------------------------------------------------