├── llmadmin
│   ├── api
│   │   ├── __init__.py
│   │   ├── env.py
│   │   ├── sdk.py
│   │   └── cli.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── llm_event.py
│   │   ├── evaluation.py
│   │   └── backend.py
│   ├── backend
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── ft
│   │   │   │   ├── const.py
│   │   │   │   ├── methods
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── lora.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── tasks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── sequenceclassification_glue_cola.py
│   │   │   │   │   ├── sequenceclassification_glue_mrpc.py
│   │   │   │   │   ├── sequenceclassification_yelp_review_full.py
│   │   │   │   │   ├── _base.py
│   │   │   │   │   ├── maskedlm_imdb.py
│   │   │   │   │   ├── tokenclassification_conll2003.py
│   │   │   │   │   ├── noheader_AdvertiseGen.py
│   │   │   │   │   └── text_generation_AdvertiseGen.py
│   │   │   │   ├── callback.py
│   │   │   │   ├── _base.py
│   │   │   │   ├── test
│   │   │   │   │   └── test_seq_cls_bert_yelp.py
│   │   │   │   ├── ray_train.py
│   │   │   │   └── transformer.py
│   │   │   ├── pipelines
│   │   │   │   ├── llamacpp
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── processors.py
│   │   │   │   │   └── llamacpp_pipeline.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── processors.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── default_pipeline.py
│   │   │   │   └── default_transformers_pipeline.py
│   │   │   ├── initializers
│   │   │   │   ├── hf_transformers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── deepspeed.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _base.py
│   │   │   │   └── llamacpp.py
│   │   │   └── utils.py
│   │   ├── server
│   │   │   ├── __init__.py
│   │   │   ├── exceptions.py
│   │   │   ├── _batch.py
│   │   │   ├── run.py
│   │   │   └── config.py
│   │   └── logger.py
│   ├── frontend
│   │   ├── __init__.py
│   │   ├── mongo_secrets.py
│   │   ├── javascript_loader.py
│   │   ├── app.py
│   │   ├── javascript
│   │   │   └── llmadmin.js
│   │   ├── utils.py
│   │   ├── mongo_logger.py
│   │   └── leaderboard.py
│   └── __init__.py
├── docs
│   └── llm-finetune.png
├── MANIFEST.in
├── pyproject.toml
├── dataset
│   └── glue
│       └── mrpc
│           └── 1.0.0
│               ├── test-00000-of-00001.parquet
│               ├── train-00000-of-00001.parquet
│               └── validation-00000-of-00001.parquet
├── llm_finetune.py
├── requirements.txt
├── models
│   ├── ft--sequenceclassification--bert-base-uncased.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B-Chat.yaml
│   ├── ft--text-generation--THUDM-chatglm2-6b.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B.yaml
│   ├── ft--sequenceclassification--bert-base-uncased-lora.yaml
│   ├── ft--maskedlm--distilbert-base-uncased.yaml
│   └── ft--text-generation--THUDM-chatglm3-6b.yaml
├── setup.py
├── llm_finetune_ray.py
├── .gitignore
├── README.md
└── LICENSE

/llmadmin/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/common/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/backend/llm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/frontend/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/backend/server/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/__init__.py:
--------------------------------------------------------------------------------
1 | from llmadmin.api.sdk import *
2 | 
--------------------------------------------------------------------------------
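The YAML files under models/ and the modules that follow fit together as a small fine-tuning pipeline: each models/*.yaml file describes the base model (model_config) and the fine-tuning job (ft_config), and the TransformersFT driver exported from llmadmin.backend.llm.ft consumes that configuration and runs the Hugging Face training loop. The minimal sketch below illustrates that flow; it assumes FTApp (defined in llmadmin.backend.server.models, not included in this listing) is a pydantic v1 model that accepts the YAML structure directly, so the loading step may differ from the project's real entry points (llm_finetune.py and the llmfinetune console script).

    import yaml

    from llmadmin.backend.llm.ft import TransformersFT
    from llmadmin.backend.server.models import FTApp  # assumption: pydantic v1 model mirroring the YAML layout

    # Load one of the bundled configs (BERT sequence classification on GLUE/MRPC).
    with open("models/ft--sequenceclassification--bert-base-uncased.yaml") as f:
        raw = yaml.safe_load(f)

    ftapp = FTApp.parse_obj(raw)

    # TransformersFT subclasses BaseFT, which is constructed from an FTApp and exposes train().
    TransformersFT(ftapp).train()
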
/llmadmin/backend/llm/ft/const.py: -------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH = "./fintuned/" -------------------------------------------------------------------------------- /llmadmin/backend/server/exceptions.py: -------------------------------------------------------------------------------- 1 | class PromptTooLongError(ValueError): 2 | pass 3 | -------------------------------------------------------------------------------- /docs/llm-finetune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/docs/llm-finetune.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE *.sh 2 | recursive-include tests *.py 3 | recursive-include models *.yaml 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | select = ["E", "F", "I", "ASYNC", "B"] 3 | line-length = 300 4 | ignore = ["F403", "B905"] -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import get_train_model 2 | 3 | __all__ = [ 4 | "get_train_model" 5 | ] 6 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/__init__.py: -------------------------------------------------------------------------------- 1 | from .llamacpp_pipeline import LlamaCppPipeline 2 | 3 | __all__ = ["LlamaCppPipeline"] -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet -------------------------------------------------------------------------------- /llm_finetune.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from llmadmin.api.cli import app 4 | if __name__ == '__main__': 5 | sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) 6 | sys.exit(app()) 7 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ._base import BaseFT 4 | from .transformer import TransformersFT 5 | from .ray_train 
import RayTrain 6 | 7 | 8 | __all__ = [ 9 | "TransformersFT", "RayTrain" 10 | ] 11 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/base.py: -------------------------------------------------------------------------------- 1 | from .lora import lora_model 2 | from llmadmin.backend.logger import get_logger 3 | 4 | logger = get_logger(__name__) 5 | 6 | def get_train_model(model, ft_method, trainConfig): 7 | if ft_method == "lora": 8 | lora_config = trainConfig.lora_config 9 | model = lora_model(model, lora_config) 10 | return model 11 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/utils.py: -------------------------------------------------------------------------------- 1 | from llmadmin.backend.server.models import FTApp 2 | 3 | def parse_task_name(ftapp: FTApp): 4 | task_purpose = (ftapp.ft_config.ft_task + "-") if ftapp.ft_config.ft_task else "" 5 | data_path = ftapp.ft_config.data_config.data_path 6 | data_name = ("-" + ftapp.ft_config.data_config.subset) if ftapp.ft_config.data_config.subset else "" 7 | 8 | return task_purpose + data_path + data_name 9 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/hf_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DeviceMapInitializer, SingleDeviceInitializer, TransformersInitializer, FinetuneInitializer, AutoModelInitializer, TransformersPipelineInitializer 2 | 3 | __all__ = [ 4 | "DeviceMapInitializer", 5 | "SingleDeviceInitializer", 6 | "TransformersInitializer", 7 | "FinetuneInitializer", 8 | "TransformersPipelineInitializer", 9 | "AutoModelInitializer", 10 | ] 11 | -------------------------------------------------------------------------------- /llmadmin/backend/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | 5 | LOG_FORMAT = ( 6 | "[%(levelname)s %(asctime)s]{rank} %(filename)s: %(lineno)d " "%(message)s" 7 | ) 8 | 9 | 10 | def get_logger(name: str = None, rank: Optional[int] = None, **kwargs): 11 | if rank is None: 12 | rank = int(os.environ.get("RANK", -1)) 13 | logger = logging.getLogger(name) 14 | level = logging.ERROR if rank > 0 else logging.INFO 15 | log_format = LOG_FORMAT.format(rank=f"[Rank {rank}]" if rank > -1 else "") 16 | logging.basicConfig(level=level, format=log_format, **kwargs) 17 | return logger 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | async_timeout==4.0.3 2 | boto3==1.34.54 3 | datasets==2.18.0 4 | evaluate==0.4.1 5 | fastapi==0.100.1 6 | filelock==3.13.1 7 | gradio==3.39.0 8 | huggingface_hub==0.21.3 9 | jieba==0.42.1 10 | mdit_py_plugins==0.3.3 11 | nltk==3.8.1 12 | numpy==1.26.4 13 | optimum==1.17.1 14 | pandas==2.2.1 15 | peft==0.9.0 16 | pydantic==1.10.9 17 | pymongo==4.6.2 18 | PyYAML==6.0.1 19 | Requests==2.31.0 20 | rich==13.7.1 21 | rouge_chinese==1.0.3 22 | torch==2.1.2 23 | transformers==4.33.0 24 | typer==0.9.0 25 | typing_extensions==4.10.0 26 | socksio==1.0.0 27 | scipy==1.11.1 28 | einops 29 | transformers_stream_generator 30 | tiktoken 31 | cpm_kernels 32 | ray[serve]==2.20.0 33 | ray[train]==2.20.0 34 | 
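# A possible environment-setup sketch (an assumption; this listing includes no install docs):
#   pip install -r requirements.txt   # the pinned dependencies above
#   pip install -e .                  # installs llmadmin and the `llmfinetune` console script declared in setup.py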
-------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ._base import BasePipeline 4 | from .default_pipeline import DefaultPipeline 5 | from .default_transformers_pipeline import DefaultTransformersPipeline 6 | from .llamacpp import LlamaCppPipeline 7 | 8 | 9 | def get_pipeline_cls_by_name(name: str) -> Type[BasePipeline]: 10 | lowercase_globals = {k.lower(): v for k, v in globals().items()} 11 | ret = lowercase_globals.get( 12 | f"{name.lower()}pipeline", lowercase_globals.get(name.lower(), None) 13 | ) 14 | assert ret 15 | return ret 16 | 17 | 18 | __all__ = [ 19 | "get_pipeline_cls_by_name", 20 | "DefaultPipeline", 21 | "DefaultTransformersPipeline", 22 | "LlamaCppPipeline", 23 | ] 24 | -------------------------------------------------------------------------------- /llmadmin/api/env.py: -------------------------------------------------------------------------------- 1 | def has_ray(): 2 | try: 3 | import ray # noqa: F401 4 | 5 | return True 6 | except ImportError: 7 | return False 8 | 9 | 10 | def has_backend(): 11 | try: 12 | import llmadmin.backend # noqa: F401 13 | 14 | return True 15 | except ImportError: 16 | return True 17 | 18 | 19 | def assert_has_ray(): 20 | assert has_ray(), ( 21 | "This command requires ray to be installed. " 22 | "Please install ray with `pip install 'ray[default]'`" 23 | ) 24 | 25 | 26 | def assert_has_backend(): 27 | assert has_backend(), ( 28 | "This command requires llmadmin backend to be installed. " 29 | "Please install backend dependencies with `pip install llmadmin[backend]`. " 30 | ) 31 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/processors.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from llama_cpp import LogitsProcessor, StoppingCriteria 5 | from transformers import MaxTimeCriteria, MinNewTokensLengthLogitsProcessor 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | class LlamaCppMinNewTokensLengthLogitsProcessor( 13 | MinNewTokensLengthLogitsProcessor, LogitsProcessor 14 | ): 15 | def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: 16 | scores = MinNewTokensLengthLogitsProcessor.__call__( 17 | self, torch.LongTensor(input_ids), torch.FloatTensor(scores)[None, :] 18 | ) 19 | return scores[0].tolist() 20 | 21 | 22 | class LlamaMaxTimeCriteria(MaxTimeCriteria, StoppingCriteria): 23 | pass -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Type 2 | 3 | from .hf_transformers import ( 4 | DeviceMapInitializer, 5 | SingleDeviceInitializer, 6 | FinetuneInitializer, 7 | TransformersPipelineInitializer, 8 | AutoModelInitializer, 9 | ) 10 | 11 | if TYPE_CHECKING: 12 | from ._base import LLMInitializer 13 | 14 | from .llamacpp import LlamaCppInitializer 15 | 16 | 17 | def get_initializer_cls_by_name(name: str) -> Type["LLMInitializer"]: 18 | lowercase_globals = {k.lower(): v for k, v in globals().items()} 19 | ret = lowercase_globals.get( 20 | f"{name.lower()}initializer", lowercase_globals.get(name.lower(), None) 21 | ) 22 
| assert ret 23 | return ret 24 | 25 | 26 | __all__ = [ 27 | "get_initializer_cls_by_name", 28 | "DeviceMapInitializer", 29 | "SingleDeviceInitializer", 30 | "FinetuneInitializer", 31 | "AutoModelInitializer", 32 | "LlamaCppInitializer", 33 | "TransformersPipelineInitializer", 34 | ] 35 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from . import sequenceclassification_glue_cola 2 | from . import sequenceclassification_glue_mrpc 3 | from . import tokenclassification_conll2003 4 | from . import noheader_AdvertiseGen 5 | from . import text_generation_AdvertiseGen 6 | from . import maskedlm_imdb 7 | from . import sequenceclassification_yelp_review_full 8 | 9 | TASK_REGISTRY = { 10 | "sequenceclassification-glue-cola": sequenceclassification_glue_cola.SequenceclassificationGlueCola, 11 | "sequenceclassification-glue-mrpc": sequenceclassification_glue_mrpc.SequenceclassificationGlueMrpc, 12 | "tokenclassification-conll2003": tokenclassification_conll2003.TokenclassificationConll2003, 13 | "noheader-AdvertiseGen": noheader_AdvertiseGen.NoheaderAdvertiseGen, 14 | "text-generation-AdvertiseGen": text_generation_AdvertiseGen.NoheaderAdvertiseGen, 15 | "maskedlm-imdb": maskedlm_imdb.MaskedLMImdb, 16 | "sequenceclassification-yelp_review_full": sequenceclassification_yelp_review_full.SequenceclassificationYelpReviewFull 17 | } -------------------------------------------------------------------------------- /models/ft--sequenceclassification--bert-base-uncased.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: bert-base-uncased 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | ft_config: 13 | ft_task: "sequenceclassification" 14 | data_config: 15 | data_path: glue 16 | subset: mrpc 17 | local_path: dataset/glue/mrpc/1.0.0 18 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data 19 | # train_file: 20 | # validation_file: 21 | input_columns: 22 | - "sentence" 23 | validation_column: validation 24 | # labels 25 | train_config: 26 | base_config: 27 | checkpoints_output_dir: finetune_models/ 28 | per_device_train_batch_size: 8 29 | learning_rate: 2e-5 30 | num_train_epochs: 2 31 | weight_decay: 0.01 32 | logging_strategy: steps 33 | evaluation_strategy: steps 34 | save_strategy: steps 35 | save_steps: 100 36 | -------------------------------------------------------------------------------- /models/ft--text-generation--Qwen-Qwen-7B-Chat.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: Qwen/Qwen-7B-Chat 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | ft_config: 13 | ft_task: "text-generation" 14 | data_config: 15 | data_path: AdvertiseGen 16 | local_path: dataset/AdvertiseGen 17 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 18 | input_columns: 19 | - "content" 20 | validation_column: summary 21 | train_config: 22 | base_config: 23 | max_length: 500 24 | checkpoints_output_dir: /tmp/finetune 25 | per_device_train_batch_size: 1 26 | per_device_eval_batch_size: 1 27 | learning_rate: 2e-5 28 | num_train_epochs: 2 29 | weight_decay: 0.01 30 | remove_unused_columns: true 31 | logging_strategy: steps 32 | evaluation_strategy: steps 33 | save_strategy: steps 34 | save_steps: 25 35 | max_steps: 50 36 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/callback.py: -------------------------------------------------------------------------------- 1 | from transformers import TrainerCallback, TrainerState, TrainerControl, TrainingArguments 2 | import threading 3 | import queue 4 | 5 | QUEUE = queue.Queue() 6 | def send_metrics(): 7 | while True: 8 | item = QUEUE.get() 9 | print("============") 10 | print(item) 11 | QUEUE.task_done() 12 | 13 | threading.Thread(target=send_metrics, daemon=True).start() 14 | 15 | class CustomCallback(TrainerCallback): 16 | """ 17 | Overriding the trainer callback to be able to compute training accuracy as well 18 | Example taken from: 19 | https://stackoverflow.com/questions/67457480/how-to-get-the-accuracy-per-epoch-or-step-for-the-huggingface-transformers-train 20 | """ 21 | METRICS_FILE = "./metrics" 22 | 23 | def __init__(self, trainer) -> None: 24 | super().__init__() 25 | self._trainer = trainer 26 | 27 | def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 28 | if control.should_log: 29 | if len(state.log_history) != 0: 30 | QUEUE.put(state.log_history[-1]) 31 | return control -------------------------------------------------------------------------------- /models/ft--text-generation--THUDM-chatglm2-6b.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: THUDM/chatglm2-6b 5 | max_input_words: 800 6 | quantization_bit: 4 7 | initialization: 8 | initializer: 9 | type: Finetune 10 | dtype: float32 11 | from_pretrained_kwargs: 12 | trust_remote_code: true 13 | # load_in_8bit: True 14 | ft_config: 15 | ft_task: "text-generation" 16 | data_config: 17 | data_path: AdvertiseGen 18 | local_path: dataset/AdvertiseGen 19 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 20 | input_columns: 21 | - "content" 22 | validation_column: summary 23 | train_config: 24 | base_config: 25 | max_length: 500 26 | checkpoints_output_dir: /tmp/finetune 27 | per_device_train_batch_size: 1 28 | per_device_eval_batch_size: 1 29 | learning_rate: 2e-5 30 | num_train_epochs: 2 31 | weight_decay: 0.01 32 | remove_unused_columns: true 33 | logging_strategy: steps 34 | evaluation_strategy: steps 35 | save_strategy: steps 36 | save_steps: 25 37 | max_steps: 50 38 | -------------------------------------------------------------------------------- /models/ft--text-generation--Qwen-Qwen-7B.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: Qwen/Qwen-7B 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | add_special_tokens: 13 | pad_token: "<|extra_0|>" 14 | eos_token: "<|endoftext|>" 15 | ft_config: 16 | ft_task: "text-generation" 17 | data_config: 18 | data_path: AdvertiseGen 19 | local_path: dataset/AdvertiseGen 20 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data 21 | input_columns: 22 | - "content" 23 | validation_column: summary 24 | train_config: 25 | base_config: 26 | max_length: 500 27 | checkpoints_output_dir: /tmp/finetune 28 | per_device_train_batch_size: 1 29 | per_device_eval_batch_size: 1 30 | learning_rate: 2e-5 31 | num_train_epochs: 2 32 | weight_decay: 0.01 33 | remove_unused_columns: true 34 | logging_strategy: steps 35 | evaluation_strategy: steps 36 | save_strategy: steps 37 | save_steps: 25 38 | max_steps: 50 39 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/lora.py: -------------------------------------------------------------------------------- 1 | from peft import get_peft_model 2 | from llmadmin.backend.logger import get_logger 3 | 4 | logger = get_logger(__name__) 5 | 6 | def get_trainable_parameters(model): 7 | """ 8 | get the number of trainable parameters in the model. 9 | """ 10 | trainable_params = 0 11 | all_param = 0 12 | for _, param in model.named_parameters(): 13 | all_param += param.numel() 14 | if param.requires_grad: 15 | trainable_params += param.numel() 16 | logger.info( 17 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 18 | ) 19 | 20 | def lora_model(model, lora_config): 21 | logger.info("Load lora config") 22 | logger.info(lora_config) 23 | # from peft import LoraConfig, TaskType 24 | # lora_config = LoraConfig( 25 | # task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1 26 | # ) 27 | # logger.info(lora_config) 28 | lora_config.loftq_config = {} 29 | logger.info("Using peft to avoid Catastrophic Forgetting") 30 | model = get_peft_model(model, lora_config) 31 | get_trainable_parameters(model) 32 | return model 33 | -------------------------------------------------------------------------------- /llmadmin/frontend/mongo_secrets.py: -------------------------------------------------------------------------------- 1 | # Use this code snippet in your app. 
2 | # If you need more information about configurations 3 | # or implementing the sample code, visit the AWS docs: 4 | # https://aws.amazon.com/developer/language/python/ 5 | 6 | import json 7 | import logging 8 | import os 9 | 10 | import boto3 11 | 12 | 13 | def get_mongo_secret_url(): 14 | mongo_url = os.getenv("MONGODB_URL") 15 | if mongo_url: 16 | return mongo_url 17 | try: 18 | secret_name = "prod/frontend/mongo_password" 19 | region_name = "us-west-2" 20 | 21 | # Create a Secrets Manager client 22 | session = boto3.session.Session() 23 | client = session.client(service_name="secretsmanager", region_name=region_name) 24 | 25 | get_secret_value_response = client.get_secret_value(SecretId=secret_name) 26 | 27 | # Decrypts secret using the associated KMS key. 28 | secret = get_secret_value_response["SecretString"] 29 | 30 | secret_dict = json.loads(secret) 31 | mongo_url = secret_dict.get("url") 32 | return mongo_url 33 | except Exception as e: 34 | # Fail quietly if we can't get the secret 35 | logging.warning(f"Failed to retrieve mongo secret, Exception: {e}") 36 | -------------------------------------------------------------------------------- /llmadmin/common/llm_event.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from enum import Enum 3 | from typing import Dict, List, Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class Flag(Enum): 9 | HATE = "hate" 10 | OBSCENE = "obscene" 11 | WRONG_LANGUAGE = "wrong-language" 12 | NONFACTUAL = "non-factual" 13 | 14 | 15 | class Vote(BaseModel): 16 | llm: str 17 | score: float 18 | 19 | 20 | class LlmResponse(BaseModel): 21 | model_id: str 22 | text: str 23 | model_config: Optional[Dict] 24 | gen_stats: Optional[Dict] 25 | 26 | 27 | class LlmEvent(BaseModel): 28 | created_at: datetime 29 | # Name of the project 30 | project_name: str 31 | 32 | # Identifier for a session 33 | session_id: Optional[str] 34 | 35 | # unique string representing this event 36 | instance_id: str 37 | 38 | # Prompt given by the user 39 | user_prompt: str 40 | responses: List[LlmResponse] 41 | 42 | # Vote is a dictionary by llm and the votes 43 | # that model got. Typically, this is 1. 44 | votes: Optional[List[Vote]] 45 | vote_comments: Optional[Dict[str, str]] 46 | 47 | # Key: llm 48 | # Value: list of flags 49 | flag: Optional[Dict[str, List[Flag]]] 50 | 51 | # Key: llm 52 | # Value: Comment for each llm 53 | flag_comments: Optional[Dict[str, str]] 54 | -------------------------------------------------------------------------------- /models/ft--sequenceclassification--bert-base-uncased-lora.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: bert-base-uncased 5 | initialization: 6 | initializer: 7 | type: Finetune 8 | dtype: float32 9 | from_pretrained_kwargs: 10 | trust_remote_code: true 11 | ft_config: 12 | # ft_stage: "sft" 13 | ft_method: "lora" 14 | ft_task: "sequenceclassification" 15 | data_config: 16 | data_path: glue 17 | subset: mrpc 18 | local_path: dataset/glue/mrpc/1.0.0 19 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 20 | input_columns: 21 | - "sentence" 22 | validation_column: validation 23 | train_config: 24 | lora_config: 25 | r: 1 # Lora attention dimension 26 | task_type: SEQ_CLS #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION 27 | lora_alpha: 1 # The alpha parameter for Lora scaling 28 | lora_dropout: 0.1 # The dropout probability for Lora layers 29 | base_config: 30 | checkpoints_output_dir: finetune_models/ 31 | per_device_train_batch_size: 8 32 | learning_rate: 2e-5 33 | num_train_epochs: 2 34 | weight_decay: 0.01 35 | logging_strategy: steps 36 | evaluation_strategy: steps 37 | save_strategy: steps 38 | save_steps: 100 39 | -------------------------------------------------------------------------------- /models/ft--maskedlm--distilbert-base-uncased.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: distilbert-base-uncased 5 | max_input_words: 800 6 | initialization: 7 | runtime_env: 8 | pip: 9 | - deepspeed==0.9.2 10 | - accelerate 11 | s3_mirror_config: 12 | bucket_uri: /tmp/hub/models/distilbert-base-uncased/ 13 | # bucket_uri: s3://large-dl-models-mirror/models--amazon--LightGPT/main-safetensors/ 14 | initializer: 15 | type: Finetune 16 | dtype: float32 17 | from_pretrained_kwargs: 18 | # use_cache: true 19 | trust_remote_code: true 20 | # use_kernel: true # for deepspped type only 21 | # max_tokens: 1536 # for deepspped type only 22 | ft_config: 23 | ft_task: maskedlm 24 | data_config: 25 | data_path: imdb 26 | subset: 27 | local_path: /tmp/hub/dataset/imdb/plain_text/1.0.0 28 | num_row: 30 29 | # train_file: 30 | # validation_file: 31 | input_columns: 32 | - "sentence" 33 | validation_column: validation 34 | # labels 35 | train_config: 36 | base_config: 37 | checkpoints_output_dir: /tmp/finetune 38 | per_device_train_batch_size: 32 39 | learning_rate: 2e-5 40 | num_train_epochs: 2 41 | weight_decay: 0.01 42 | remove_unused_columns: false 43 | logging_strategy: steps 44 | evaluation_strategy: steps 45 | save_strategy: steps 46 | save_steps: 100 47 | scaling_config: 48 | num_workers: 7 49 | num_gpus_per_worker: 0 50 | num_cpus_per_worker: 1 # for infrence 51 | # resources_per_worker: 52 | # accelerator_type_cpu: 0.01 53 | ray_actor_options: 54 | num_cpus: 0.1 55 | -------------------------------------------------------------------------------- /models/ft--text-generation--THUDM-chatglm3-6b.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: THUDM/chatglm3-6b 5 | max_input_words: 800 6 | quantization_bit: 4 7 | initialization: 8 | # s3_mirror_config: 9 | # endpoint_url: http://39.107.108.170:9000 # Optinal for custom S3 storage endpoint url 10 | # bucket_uri: s3://opt-125m/facemodel/ # Must include hash file with commit id in repo 11 | # bucket_uri: /root/.cache/hub/ZhipuAI/chatglm3-6b/ # Local path of model with hash file 12 | initializer: 13 | type: Finetune 14 | dtype: float32 15 | from_pretrained_kwargs: 16 | trust_remote_code: true 17 | ft_config: 18 | ft_task: "text-generation" 19 | ft_method: "lora" 20 | data_config: 21 | data_path: AdvertiseGen 22 | local_path: dataset/AdvertiseGen 23 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 24 | input_columns: 25 | - "content" 26 | validation_column: summary 27 | train_config: 28 | lora_config: 29 | r: 1 # Lora attention dimension 30 | task_type: CAUSAL_LM #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION 31 | lora_alpha: 1 # The alpha parameter for Lora scaling 32 | lora_dropout: 0.1 # The dropout probability for Lora layers 33 | base_config: 34 | max_length: 500 35 | checkpoints_output_dir: /tmp/finetune 36 | per_device_train_batch_size: 1 37 | per_device_eval_batch_size: 1 38 | learning_rate: 2e-5 39 | num_train_epochs: 2 40 | weight_decay: 0.01 41 | remove_unused_columns: true 42 | logging_strategy: steps 43 | evaluation_strategy: steps 44 | save_strategy: steps 45 | save_steps: 25 46 | max_steps: 50 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import find_packages, setup 3 | this_directory = os.path.abspath(os.path.dirname(__file__)) 4 | with open(os.path.join(this_directory, "requirements.txt"), encoding="utf-8") as f: 5 | INSTALL_REQUIRES = f.read().splitlines() 6 | 7 | EXTRAS_REQUIRE = { 8 | "dev": INSTALL_REQUIRES + [ 9 | "pre-commit", 10 | "ruff==0.0.270", 11 | "black==23.3.0", 12 | ], 13 | "test": INSTALL_REQUIRES + [ 14 | "pytest", 15 | ], 16 | "docs": INSTALL_REQUIRES + [ 17 | "mkdocs-material", 18 | ], 19 | } 20 | 21 | setup( 22 | name="llmfinetune", 23 | version="0.0.1", 24 | description="A framework to finetune LLMs", 25 | long_description=open("README.md", "r", encoding="utf-8").read(), 26 | long_description_content_type="text/markdown", 27 | packages=find_packages(include="llmadmin*"), 28 | keywords=["ChatGLM", "BaiChuan", "LLaMA", "BLOOM", "Falcon", 29 | "LLM", "ChatGPT", "transformer", "pytorch", "deep learning"], 30 | include_package_data=True, 31 | package_data={"llmadmin": ["models/*"]}, 32 | entry_points={ 33 | "console_scripts": [ 34 | "llmfinetune=llmadmin.api.cli:app", 35 | ] 36 | }, 37 | extras_require=EXTRAS_REQUIRE, 38 | install_requires=INSTALL_REQUIRES, 39 | python_requires=">=3.8", 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "Intended Audience :: Education", 44 | "Intended Audience :: Science/Research", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Operating System :: OS Independent", 47 | "Programming Language :: Python :: 3.8", 48 | "Programming Language :: Python :: 3.9", 49 | "Programming Language :: Python :: 3.10", 50 | "Programming Language :: Python :: 3.11", 51 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 52 | ] 53 | ) 54 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from llmadmin.backend.logger import get_logger 3 | from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset 4 | from typing import Union, TYPE_CHECKING, List 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | from llmadmin.backend.server.models import FTApp 7 | import torch 8 | from llmadmin.backend.llm.initializers import get_initializer_cls_by_name 9 | 10 | if TYPE_CHECKING: 11 | from ..initializers._base import LLMInitializer 12 | 13 | logger = get_logger(__name__) 14 | 15 | class BaseFT(ABC): 16 | """base fine tune 
class. 17 | 18 | Args: 19 | """ 20 | 21 | def __init__( 22 | self, 23 | ftapp: FTApp, 24 | ) -> None: 25 | self.ftapp = ftapp 26 | self.data_conf = ftapp.ft_config.data_config 27 | self.train_conf = ftapp.ft_config.train_config.base_config 28 | self.model_config = ftapp.model_config 29 | self.ft_task = ftapp.ft_config.ft_task 30 | self.scale_config = ftapp.scaling_config 31 | 32 | # Lazy import so that the new cache location is used 33 | torch.backends.cuda.matmul.allow_tf32 = True 34 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 35 | 36 | initializer_name = self.model_config.initialization.initializer 37 | if not isinstance(initializer_name, str): 38 | initializer_name = initializer_name.type 39 | 40 | logger.info(f"Finetune initializer name '{initializer_name}' on device {device}") 41 | initializer = get_initializer_cls_by_name(initializer_name)( 42 | device=device, 43 | world_size=1, # fake 44 | **self.model_config.initialization.initializer.get_initializer_kwargs(), 45 | ) 46 | 47 | self.initializer = initializer 48 | 49 | @abstractmethod 50 | def train(self): 51 | pass 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/test/test_seq_cls_bert_yelp.py: -------------------------------------------------------------------------------- 1 | # Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training 2 | 3 | import numpy as np 4 | import evaluate 5 | from datasets import load_dataset 6 | from transformers import ( 7 | Trainer, 8 | TrainingArguments, 9 | AutoTokenizer, 10 | AutoModelForSequenceClassification, 11 | ) 12 | 13 | num_labels = 5 14 | modelPath = "bert-base-cased" 15 | modelPath = "/Users/hub/models/bert-base-cased" 16 | dsPath = "yelp_review_full" 17 | dsPath = "/Users/hub/models/yelp_review_full/1.0.0" 18 | 19 | # Datasets 20 | dataset = load_dataset(dsPath) 21 | print('Loaded dataset', dataset) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(modelPath) 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128) 27 | 28 | count = 10 29 | small_train_dataset = dataset["train"].select(range(count)).map(tokenize_function, batched=True) 30 | small_eval_dataset = dataset["test"].select(range(count)).map(tokenize_function, batched=True) 31 | print('small train dataset', small_train_dataset) 32 | print('small eval dataset', small_eval_dataset) 33 | 34 | # Model 35 | model = AutoModelForSequenceClassification.from_pretrained(modelPath, num_labels=num_labels) 36 | 37 | # Metrics 38 | metric = evaluate.load("accuracy") 39 | 40 | def compute_metrics(eval_pred): 41 | logits, labels = eval_pred 42 | predictions = np.argmax(logits, axis=-1) 43 | return metric.compute(predictions=predictions, references=labels) 44 | 45 | # Hugging Face Trainer 46 | training_args = TrainingArguments( 47 | output_dir="test_trainer", 48 | evaluation_strategy="epoch", 49 | report_to="none" 50 | ) 51 | 52 | trainer = Trainer( 53 | model=model, 54 | args=training_args, 55 | train_dataset=small_train_dataset, 56 | eval_dataset=small_eval_dataset, 57 | compute_metrics=compute_metrics, 58 | ) 59 | 60 | # Start Training 61 | trainer.train() 62 | -------------------------------------------------------------------------------- /llmadmin/frontend/javascript_loader.py: -------------------------------------------------------------------------------- 1 | # 
https://github.com/gradio-app/gradio/discussions/2932 2 | import mimetypes 3 | import os 4 | 5 | import gradio.routes 6 | 7 | mimetypes.init() 8 | mimetypes.add_type("application/javascript", ".js") 9 | 10 | 11 | class ScriptLoader: 12 | path_map = { 13 | "js": os.path.abspath(os.path.join(os.path.dirname(__file__), "javascript")), 14 | "py": os.path.abspath(os.path.join(os.path.dirname(__file__), "python")), 15 | } 16 | 17 | def __init__(self, script_type): 18 | self.script_type = script_type 19 | self.path = ScriptLoader.path_map[script_type] 20 | self.loaded_scripts = [] 21 | 22 | @staticmethod 23 | def get_scripts(path: str, file_type: str) -> list[tuple[str, str]]: 24 | scripts = [] 25 | dir_list = [os.path.join(path, f) for f in os.listdir(path)] 26 | files_list = [f for f in dir_list if os.path.isfile(f)] 27 | for s in files_list: 28 | # Dont forget the "." for file extension 29 | if os.path.splitext(s)[1] == f".{file_type}": 30 | scripts.append((s, os.path.basename(s))) 31 | return scripts 32 | 33 | 34 | class JavaScriptLoader(ScriptLoader): 35 | def __init__(self): 36 | super().__init__("js") 37 | self.original_template = gradio.routes.templates.TemplateResponse 38 | self.load_js() 39 | gradio.routes.templates.TemplateResponse = self.template_response 40 | 41 | def load_js(self): 42 | js_scripts = ScriptLoader.get_scripts(self.path, self.script_type) 43 | for file_path, file_name in js_scripts: 44 | with open(file_path, "r", encoding="utf-8") as file: 45 | self.loaded_scripts.append( 46 | f"\n\n" 47 | ) 48 | 49 | def template_response(self, *args, **kwargs): 50 | response = self.original_template(*args, **kwargs) 51 | response.body = response.body.replace( 52 | "".encode("utf-8"), 53 | f"{''.join(self.loaded_scripts)}\n".encode("utf-8"), 54 | ) 55 | response.init_headers() 56 | return response 57 | -------------------------------------------------------------------------------- /llmadmin/frontend/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | import uuid 5 | from typing import Any, Dict, List 6 | import ray 7 | import requests 8 | 9 | from llmadmin.common.backend import get_llmadmin_backend 10 | from llmadmin.common.constants import ( 11 | AVIARY_DESC, 12 | CSS, 13 | EXAMPLES_IF, 14 | EXAMPLES_QA, 15 | EXAMPLES_ST, 16 | HEADER, 17 | LOGO_ANYSCALE, 18 | LOGO_GITHUB, 19 | LOGO_RAY, 20 | LOGO_RAY_TYPEFACE, 21 | MODEL_DESCRIPTION_FORMAT, 22 | MODEL_DESCRIPTIONS_HEADER, 23 | MODELS, 24 | NUM_LLM_OPTIONS, 25 | PROJECT_NAME, 26 | SELECTION_DICT, 27 | SUB_HEADER, 28 | ) 29 | from llmadmin.frontend.javascript_loader import JavaScriptLoader 30 | from llmadmin.frontend.leaderboard import DummyLeaderboard, Leaderboard 31 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url 32 | from llmadmin.frontend.utils import ( 33 | DEFAULT_STATS, 34 | LOGGER, 35 | THEME, 36 | blank, 37 | deactivate_buttons, 38 | gen_stats, 39 | log_flags, 40 | paused_logger, 41 | select_button, 42 | unset_buttons, 43 | ) 44 | 45 | std_logger = logging.getLogger("ray.logger") 46 | 47 | @ray.remote(num_cpus=0) 48 | def completions(bakend, prompt, llm, index): 49 | try: 50 | out = bakend.completions(prompt=prompt, llm=llm) 51 | except Exception as e: 52 | if isinstance(e, requests.ReadTimeout) or ( 53 | hasattr(e, "response") 54 | and ("timeout" in e.response or e.response.status_code in (408, 504)) 55 | ): 56 | out = ( 57 | "[LLM-ADMIN] The request timed out. 
This usually means the server " 58 | "is experiencing a higher than usual load. " 59 | "Please try again in a few minutes." 60 | ) 61 | elif hasattr(e, "response"): 62 | out = ( 63 | f"[LLM-ADMIN] Backend returned an error. " 64 | f"Status code: {e.response.status_code}" 65 | f"\nResponse: {e.response.text.split('raise ')[-1]}" 66 | ).replace("\n", " ") 67 | else: 68 | out = f"[LLM-ADMIN] An error occurred. Please try again.\nError: {e}" 69 | out = {"error": out} 70 | return out, index -------------------------------------------------------------------------------- /llmadmin/frontend/javascript/llmadmin.js: -------------------------------------------------------------------------------- 1 | // Set favicon 2 | const FAVICON = 3 | "data:image/svg+xml,🦜"; 4 | function setFavicon(link) { 5 | let favicon = document.querySelector('link[rel="icon"]'); 6 | 7 | if (favicon) { 8 | favicon.href = link; 9 | } else { 10 | favicon = document.createElement("link"); 11 | favicon.rel = "icon"; 12 | favicon.href = link; 13 | 14 | document.head.appendChild(favicon); 15 | } 16 | } 17 | // setFavicon(FAVICON); 18 | 19 | // Get news 20 | const NEWS_URL = "https://api.github.com/repos/ray-project/llmadmin/issues/8"; 21 | function getNews(newsUrl) { 22 | return fetch(newsUrl) 23 | .then((response) => { 24 | if (!response.ok) { 25 | throw new Error("Unable to fetch news."); 26 | } 27 | return response.text(); 28 | }) 29 | .then((data) => { 30 | return (title = JSON.parse(data)["title"]); 31 | }) 32 | .catch((error) => console.error("Unable to parse response: ", error)); 33 | } 34 | 35 | // Wait for the ticker div to be added to DOM to set the news content 36 | const observer = new MutationObserver((mutationsList, observer) => { 37 | for (let mutation of mutationsList) { 38 | if (mutation.type === "childList") { 39 | let element = document.getElementsByClassName("ticker"); 40 | if (element.length > 0) { 41 | getNews(NEWS_URL).then((newsTitle) => { 42 | document.getElementsByClassName("ticker")[0].innerHTML = 43 | "\uD83D\uDCE3 " + newsTitle; 44 | }); 45 | observer.disconnect(); 46 | break; 47 | } 48 | } 49 | } 50 | }); 51 | 52 | (function () { 53 | // Add Google Tag Manager 54 | const head = document.getElementsByTagName("head")[0]; 55 | var gtm = document.createElement("script"); 56 | gtm.text = 57 | "(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-5ZPDX2P');"; 58 | head.insertBefore(gtm, head.children[0]); 59 | 60 | document.addEventListener("DOMContentLoaded", function () { 61 | observer.observe(document.body, { childList: true, subtree: true }); 62 | }); 63 | })(); -------------------------------------------------------------------------------- /llmadmin/frontend/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | # import gradio as gr 4 | 5 | from llmadmin.common.constants import ( 6 | G5_COST_PER_S_IN_DOLLARS, 7 | NUM_LLM_OPTIONS, 8 | PROJECT_NAME, 9 | ) 10 | from llmadmin.frontend.mongo_logger import MongoLogger 11 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url 12 | 13 | LOGGER = None 14 | 15 | # MONGODB_URL = get_mongo_secret_url() 16 | # if MONGODB_URL: 17 | # LOGGER = MongoLogger(url=MONGODB_URL, project_name=PROJECT_NAME) 18 | # else: 19 | # 
print("No MongoDB logger defined, will default to the CSVLogger") 20 | # LOGGER = gr.CSVLogger() 21 | # LOGGER = gr.CSVLogger() 22 | 23 | 24 | DEFAULT_STATS = t = """ 25 | | | | 26 | |---|---| 27 | | Latency [s] | - | 28 | | Cost [$] | - | 29 | | Tokens (i/o) | - | 30 | | Per 1K Tokens [$] | - | 31 | """ 32 | 33 | 34 | def gen_stats(dictionary): 35 | cost_per_k = ( 36 | dictionary["total_time"] 37 | * G5_COST_PER_S_IN_DOLLARS 38 | / dictionary["num_total_tokens"] 39 | * 1000 40 | ) 41 | 42 | return f""" 43 | | | | 44 | |---|---| 45 | | Lat [s] | {dictionary['total_time']:.1f} | 46 | | Cost [$] | {dictionary['total_time'] * G5_COST_PER_S_IN_DOLLARS:.4f} | 47 | | Tokens (i/o) | {dictionary['num_total_tokens']:.1f} | 48 | | Per 1K Tok [$] | {cost_per_k:.4f} | 49 | """ 50 | 51 | 52 | def blank(): 53 | return "" 54 | 55 | 56 | # def select_button(button): 57 | # return button, gr.Button.update(variant="primary") 58 | 59 | 60 | # def deactivate_buttons(): 61 | # return [gr.Button.update(interactive=False)] * NUM_LLM_OPTIONS 62 | 63 | 64 | # def unset_buttons(): 65 | # return [gr.Button.update(variant="secondary", interactive=True)] * NUM_LLM_OPTIONS 66 | 67 | 68 | # def paused_logger(*args): 69 | # time.sleep(1) 70 | # LOGGER.flag(*args) 71 | 72 | 73 | # def log_flags(*args): 74 | # LOGGER.flag(args) 75 | 76 | 77 | # THEME = gr.themes.Default( 78 | # primary_hue="blue", 79 | # secondary_hue="blue", 80 | # ).set( 81 | # border_color_accent="blue", 82 | # shadow_spread="20", 83 | # shadow_spread_dark="0", 84 | # button_primary_background_fill="*primary_200", 85 | # button_primary_background_fill_dark="*primary_700", 86 | # button_primary_border_color_dark="*primary_600", 87 | # ) 88 | -------------------------------------------------------------------------------- /llmadmin/backend/server/_batch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass, field 3 | from enum import IntEnum 4 | from functools import wraps 5 | from typing import Any, Callable, List, Optional, Tuple, Type 6 | 7 | # TODO: Upstream to Serve. 8 | 9 | 10 | def extract_self_if_method_call(args: List[Any], func: Callable) -> Optional[object]: 11 | """Check if this is a method rather than a function. 12 | 13 | Does this by checking to see if `func` is the attribute of the first 14 | (`self`) argument under `func.__name__`. Unfortunately, this is the most 15 | robust solution to this I was able to find. It would also be preferable 16 | to do this check when the decorator runs, rather than when the method is. 17 | 18 | Returns the `self` object if it's a method call, else None. 19 | 20 | Arguments: 21 | args: arguments to the function/method call. 22 | func: the unbound function that was called. 
23 | """ 24 | if len(args) > 0: 25 | method = getattr(args[0], func.__name__, False) 26 | if method: 27 | wrapped = getattr(method, "__wrapped__", False) 28 | if wrapped and wrapped == func: 29 | return args[0] 30 | 31 | return None 32 | 33 | 34 | class QueuePriority(IntEnum): 35 | """Lower value = higher priority""" 36 | 37 | GENERATE_TEXT = 0 38 | BATCH_GENERATE_TEXT = 1 39 | 40 | 41 | @dataclass(order=True) 42 | class _PriorityWrapper: 43 | """Wrapper allowing for priority queueing of arbitrary objects.""" 44 | 45 | obj: Any = field(compare=False) 46 | priority: int = field(compare=True) 47 | 48 | 49 | class PriorityQueueWithUnwrap(asyncio.PriorityQueue): 50 | def get_nowait(self) -> Any: 51 | # Get just the obj from _PriorityWrapper 52 | ret: _PriorityWrapper = super().get_nowait() 53 | return ret.obj 54 | 55 | 56 | def _validate_max_batch_size(max_batch_size): 57 | if not isinstance(max_batch_size, int): 58 | if isinstance(max_batch_size, float) and max_batch_size.is_integer(): 59 | max_batch_size = int(max_batch_size) 60 | else: 61 | raise TypeError("max_batch_size must be integer >= 1") 62 | 63 | if max_batch_size < 1: 64 | raise ValueError("max_batch_size must be an integer >= 1") 65 | 66 | 67 | def _validate_batch_wait_timeout_s(batch_wait_timeout_s): 68 | if not isinstance(batch_wait_timeout_s, (float, int)): 69 | raise TypeError("batch_wait_timeout_s must be a float >= 0") 70 | 71 | if batch_wait_timeout_s < 0: 72 | raise ValueError("batch_wait_timeout_s must be a float >= 0") 73 | 74 | 75 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_cola.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationGlueCola(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "glue" 13 | DATASET_NAME = "cola" 14 | 15 | def get_data_proprocess(self) -> Any: 16 | tokenizer = self.tokenizer 17 | 18 | # adopt python decorator TODO 19 | def preprocess_function(examples: pd.DataFrame): 20 | # examples = examples.to_dict("list") 21 | ret = tokenizer(examples["sentence"], truncation=True) 22 | 23 | # Add back the original columns 24 | ret = {**examples, **ret} 25 | return pd.DataFrame.from_dict(ret) 26 | 27 | return preprocess_function 28 | 29 | def get_compute_metrics(self) -> Any: 30 | DATASET_PATH = self.DATASET_PATH 31 | DATASET_NAME = self.DATASET_NAME 32 | 33 | def compute_metrics(eval_preds): 34 | metric = evaluate.load(DATASET_PATH, DATASET_NAME) 35 | logits, labels = eval_preds 36 | predictions = np.argmax(logits, axis=-1) 37 | return metric.compute(predictions=predictions, references=labels) 38 | 39 | return compute_metrics 40 | 41 | def get_data_collator(self) -> Any: 42 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 43 | return data_collator 44 | 45 | def training_key(self): 46 | """ 47 | :return: Iterable[obj] 48 | A iterable of any object, that doc_to_text can handle 49 | """ 50 | return "train" 51 | 52 | def validation_key(self): 53 | """ 54 | :return: Iterable[obj] 55 | A iterable of any object, that doc_to_text can handle 56 | """ 57 | return "validation" 58 | 59 | def getTrainDataSet(self): 60 | return 
self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 61 | 62 | def getEvalDataSet(self): 63 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 64 | 65 | def getSmallTrainDataSet(self, len: int): 66 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 67 | 68 | def getSmallEvalDataSet(self, len: int): 69 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_mrpc.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationGlueMrpc(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "glue" 13 | DATASET_NAME = "mrpc" 14 | FROM_PRETRAINED_KWARGS = { 15 | # "num_labels": 2 16 | } 17 | 18 | def get_data_proprocess(self) -> Any: 19 | tokenizer = self.tokenizer 20 | 21 | # adopt python decorator TODO 22 | def preprocess_function(examples: pd.DataFrame): 23 | # examples = examples.to_dict("list") 24 | ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=self.ft_config.train_config.base_config.max_length) 25 | 26 | # Add back the original columns 27 | ret = {**examples, **ret} 28 | return pd.DataFrame.from_dict(ret) 29 | 30 | return preprocess_function 31 | 32 | def get_compute_metrics(self) -> Any: 33 | DATASET_PATH = self.DATASET_PATH 34 | DATASET_NAME = self.DATASET_NAME 35 | 36 | def compute_metrics(eval_preds): 37 | metric = evaluate.load(DATASET_PATH, DATASET_NAME) 38 | logits, labels = eval_preds 39 | predictions = np.argmax(logits, axis=-1) 40 | return metric.compute(predictions=predictions, references=labels) 41 | 42 | return compute_metrics 43 | 44 | def get_data_collator(self) -> Any: 45 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 46 | return data_collator 47 | 48 | def training_key(self): 49 | """ 50 | :return: Iterable[obj] 51 | A iterable of any object, that doc_to_text can handle 52 | """ 53 | return "train" 54 | 55 | def validation_key(self): 56 | """ 57 | :return: Iterable[obj] 58 | A iterable of any object, that doc_to_text can handle 59 | """ 60 | return "validation" 61 | 62 | def getTrainDataSet(self): 63 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 64 | 65 | def getEvalDataSet(self): 66 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 67 | 68 | def getSmallTrainDataSet(self, len: int): 69 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 70 | 71 | def getSmallEvalDataSet(self, len: int): 72 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 73 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | from transformers import PreTrainedModel, 
PreTrainedTokenizer 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | class LLMInitializer(ABC): 13 | """Initialize model and tokenizer and place them on the correct device. 14 | 15 | Args: 16 | device (torch.device): Device to place model and tokenizer on. 17 | world_size (int): Number of GPUs to use. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | device: torch.device, 23 | world_size: int, 24 | ): 25 | self.device = device 26 | self.world_size = world_size 27 | 28 | def load(self, model_id: str) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]: 29 | """Load model and tokenizer. 30 | 31 | Args: 32 | model_id (str): Hugging Face model ID. 33 | """ 34 | model = self.load_model(model_id) 35 | tokenizer = self.load_tokenizer(model_id) 36 | return self.postprocess(model, tokenizer) 37 | 38 | @abstractmethod 39 | def load_model(self, model_id: str) -> "PreTrainedModel": 40 | """Load model. 41 | 42 | Args: 43 | model_id (str): Hugging Face model ID. 44 | """ 45 | pass 46 | 47 | @abstractmethod 48 | def load_tokenizer(self, tokenizer_id: str) -> "PreTrainedTokenizer": 49 | """Load tokenizer. 50 | 51 | Args: 52 | tokenizer_id (str): Hugging Face tokenizer name. 53 | """ 54 | pass 55 | 56 | def postprocess( 57 | self, model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer" 58 | ) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]: 59 | """Postprocess model and tokenizer. 60 | 61 | Args: 62 | model (PreTrainedModel): Model to postprocess. 63 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess. 64 | """ 65 | return self.postprocess_model(model), self.postprocess_tokenizer(tokenizer) 66 | 67 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel": 68 | """Postprocess model. 69 | 70 | Args: 71 | model (PreTrainedModel): Model to postprocess. 72 | """ 73 | return model 74 | 75 | def postprocess_tokenizer( 76 | self, tokenizer: "PreTrainedTokenizer" 77 | ) -> "PreTrainedTokenizer": 78 | """Postprocess tokenizer. 79 | 80 | Args: 81 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess. 82 | """ 83 | return tokenizer 84 | 85 | def get_model_init_kwargs(self) -> dict: 86 | """Load tokenizer. 87 | 88 | Args: 89 | tokenizer_id (str): Hugging Face tokenizer name. 
90 | """ 91 | return {} -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_yelp_review_full.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationYelpReviewFull(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "yelp_review_full" 13 | DATASET_NAME = "" 14 | FROM_PRETRAINED_KWARGS = { 15 | "num_labels": 5 16 | } 17 | 18 | def get_data_proprocess(self) -> Any: 19 | tokenizer = self.tokenizer 20 | 21 | # adopt python decorator TODO 22 | def preprocess_function(examples: pd.DataFrame): 23 | examples = examples.to_dict("list") 24 | ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True) 25 | 26 | # Add back the original columns 27 | ret = {**examples, **ret} 28 | return pd.DataFrame.from_dict(ret) 29 | 30 | return preprocess_function 31 | 32 | def get_compute_metrics(self) -> Any: 33 | DATASET_PATH = self.DATASET_PATH 34 | DATASET_NAME = self.DATASET_NAME 35 | 36 | def compute_metrics(eval_preds): 37 | # metric = evaluate.load(DATASET_PATH, DATASET_NAME) 38 | metric = evaluate.load("accuracy") 39 | logits, labels = eval_preds 40 | predictions = np.argmax(logits, axis=-1) 41 | return metric.compute(predictions=predictions, references=labels) 42 | 43 | return compute_metrics 44 | 45 | def get_data_collator(self) -> Any: 46 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 47 | return data_collator 48 | 49 | def training_key(self): 50 | """ 51 | :return: Iterable[obj] 52 | A iterable of any object, that doc_to_text can handle 53 | """ 54 | return "train" 55 | 56 | def validation_key(self): 57 | """ 58 | :return: Iterable[obj] 59 | A iterable of any object, that doc_to_text can handle 60 | """ 61 | return "validation" 62 | 63 | def tokenize_function(self, examples): 64 | return self.tokenizer(examples["text"], padding="max_length", truncation=True, max_length=self.ft_config.train_config.base_config.max_length) 65 | 66 | def getTrainDataSet(self): 67 | return self.dataset[self.training_key()].map(self.tokenize_function, batched=True) 68 | 69 | def getEvalDataSet(self): 70 | return self.dataset[self.validation_key()].map(self.tokenize_function, batched=True) 71 | 72 | def getSmallTrainDataSet(self, len: int): 73 | return self.dataset[self.training_key()].select(range(len)).map(self.tokenize_function, batched=True) 74 | 75 | def getSmallEvalDataSet(self, len: int): 76 | return self.dataset[self.validation_key()].select(range(len)).map(self.tokenize_function, batched=True) 77 | -------------------------------------------------------------------------------- /llmadmin/frontend/mongo_logger.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timezone 3 | from typing import Any 4 | 5 | # from gradio import FlaggingCallback 6 | from pymongo import MongoClient 7 | 8 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME 9 | from llmadmin.common.llm_event import LlmEvent, LlmResponse, Vote 10 | 11 | 12 | # class MongoLogger(FlaggingCallback): 13 | # """Logs flagged events to Mongo DB.""" 14 | 15 | # def __init__(self, url, project_name) -> None: 16 | # self.url = url 17 | # 
self.client = MongoClient(url) 18 | # self.project_name = project_name 19 | # self.components = None 20 | # try: 21 | # self.client.admin.command("ping") 22 | # print("Pinged MongoDB. Correctly set up") 23 | # except Exception as e: 24 | # print(e) 25 | 26 | # def setup(self, components): 27 | # self.components = components 28 | # # Check if the database exists 29 | # if DB_NAME in self.client.list_database_names(): 30 | # self.db = self.client[DB_NAME] 31 | # print(f"Database '{DB_NAME}' already exists.") 32 | # else: 33 | # # The database doesn't exist, so create it 34 | # self.db = self.client[DB_NAME] 35 | # print(f"Database '{DB_NAME}' created.") 36 | 37 | # # OK, now we create a collection. 38 | # # Check if the collection exists 39 | # if COLLECTION_NAME in self.db.list_collection_names(): 40 | # # The collection exists 41 | # print( 42 | # f"Collection '{COLLECTION_NAME}' already exists in database '{DB_NAME}'." 43 | # ) 44 | # else: 45 | # # The collection doesn't exist, so create it 46 | # self.db.create_collection(COLLECTION_NAME) 47 | # print(f"Collection '{COLLECTION_NAME}' created in database '{DB_NAME}'.") 48 | 49 | # def flag(self, flag_data: list[Any], flag_option: str = "", username: str = ""): 50 | # print(f"last value is: {flag_data}") 51 | # event = LlmEvent( 52 | # project_name=self.project_name, 53 | # created_at=datetime.now(timezone.utc), 54 | # instance_id=str(uuid.uuid4()), 55 | # user_prompt=flag_data[0], 56 | # # TODO(mwk): Work out how to generalize this to _n_ inputs 57 | # responses=[ 58 | # LlmResponse( 59 | # model_id=flag_data[1], text=flag_data[4], gen_stats=flag_data[8][0] 60 | # ), 61 | # LlmResponse( 62 | # model_id=flag_data[2], text=flag_data[5], gen_stats=flag_data[8][1] 63 | # ), 64 | # LlmResponse( 65 | # model_id=flag_data[3], text=flag_data[6], gen_stats=flag_data[8][2] 66 | # ), 67 | # ], 68 | # session_id=flag_data[9], 69 | # ) 70 | # if flag_data[7]: 71 | # vote_number = int(flag_data[7][-1]) 72 | # event.votes = Vote(llm=flag_data[vote_number], score=1) 73 | 74 | # print(f"Event is {event.json()}") 75 | # result = self.client[DB_NAME][COLLECTION_NAME].insert_one(event.dict()) 76 | # print(f"Mongo result {result}") 77 | -------------------------------------------------------------------------------- /llmadmin/common/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | class GPT: 7 | """A simple wrapper around the OpenAI API for evaluating GPT models.""" 8 | 9 | def __init__(self, model_version="gpt-4", temperature=0.9, max_tokens=2048): 10 | api_key = os.getenv("GPT4_API_KEY") 11 | assert api_key, "Please set the GPT4_API_KEY environment variable" 12 | self.__api_key = os.getenv("GPT4_API_KEY") 13 | self.temperature = temperature 14 | self.max_tokens = max_tokens 15 | self.model = model_version 16 | 17 | def evaluate_results(self, prompt, results): 18 | """Evaluate a list of results generated by several models on a single prompt.""" 19 | for result in results: 20 | result.pop("stats", None) 21 | 22 | gpt_messages = [ 23 | { 24 | "role": "system", 25 | "content": ( 26 | """You are an assistant tasked with ranking responses in 27 | order of quality, creating a leaderboard of all models. 28 | The best model has rank 1, the second best has rank 2, etc. 
29 | You have to assess the quality of the responses, and rank them.""" 30 | ), 31 | }, 32 | { 33 | "role": "user", 34 | "content": ( 35 | f"""You are given a prompt and a list of responses 36 | from several models in Python dictionary format. 37 | Specifically, the format of the results is as follows: 38 | 39 | 'model': , 'result': 40 | 41 | Your job is to "rank" the responses in order of quality, (not by 42 | the order in which they were generated). 43 | 44 | The prompt is: {prompt} 45 | The responses are: {results} 46 | 47 | Please rank the responses by quality, and return a list of the model 48 | names and ranks, i.e produce the following output: 49 | 50 | 'model': , 'rank': 51 | 52 | Only output this format, and nothing else. Your response must 53 | be a valid Python dictionary. 54 | Think step by step and give me this quality ranking. 55 | """ 56 | ), 57 | }, 58 | ] 59 | return self.generate(gpt_messages) 60 | 61 | def generate(self, messages): 62 | data = { 63 | "model": self.model, 64 | "messages": messages, 65 | "max_tokens": self.max_tokens, 66 | "temperature": self.temperature, 67 | } 68 | headers = { 69 | "Content-Type": "application/json", 70 | "Authorization": f"Bearer {self.__api_key}", 71 | } 72 | resp = requests.post( 73 | url="https://api.openai.com/v1/chat/completions", json=data, headers=headers 74 | ) 75 | 76 | if not resp.ok: 77 | raise RuntimeError(f"Failed to generate: {resp.reason}") 78 | 79 | return resp.json()["choices"][0]["message"]["content"] 80 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/processors.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import torch 4 | from transformers import LogitsProcessor, StoppingCriteria 5 | 6 | from llmadmin.backend.logger import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | class StopOnTokens(StoppingCriteria): 12 | """ 13 | Stopping criteria to allow stopping on multi-token sequences. 14 | 15 | ``first_stopping_token_in_batch`` attribute can be used for postprocessing after 16 | generation. 17 | 18 | Args: 19 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on. 20 | """ 21 | 22 | def __init__(self, stopping_sequences: List[Union[List[int], int]]) -> None: 23 | self.stopping_sequences = stopping_sequences 24 | self.stop_ids = [ 25 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 26 | for stop_id in self.stopping_sequences 27 | ] 28 | self.first_stopping_token_in_batch = {} 29 | 30 | def __call__( 31 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs 32 | ) -> bool: 33 | for batch_index, batch in enumerate(input_ids): 34 | if batch_index not in self.first_stopping_token_in_batch: 35 | for stop_id in self.stop_ids: 36 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal( 37 | stop_id.to(batch.device) 38 | ): 39 | self.first_stopping_token_in_batch[batch_index] = len(batch) - 1 40 | break 41 | return len(self.first_stopping_token_in_batch) == len(input_ids) 42 | 43 | 44 | class StopOnTokensLogitsProcessor(LogitsProcessor): 45 | """ 46 | Processor to force only EOS token after encountering a stopping sequence. 47 | 48 | Args: 49 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on. 50 | eos_token_id (Union[int, List[int]]): EOS token id(s). 
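    Example (minimal sketch; ``model``, ``tokenizer``, ``inputs`` and the stop
    token ids below are placeholders, not values used elsewhere in this repo):

        from transformers import LogitsProcessorList

        processor = StopOnTokensLogitsProcessor(
            stopping_sequences=[[1234, 5678]],
            eos_token_id=tokenizer.eos_token_id,
        )
        outputs = model.generate(
            **inputs,
            logits_processor=LogitsProcessorList([processor]),
        )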
51 | """ 52 | 53 | def __init__( 54 | self, 55 | stopping_sequences: List[Union[List[int], int]], 56 | eos_token_id: Union[int, List[int]], 57 | ) -> None: 58 | if isinstance(eos_token_id, int): 59 | eos_token_id = [eos_token_id] 60 | self.eos_token_id = eos_token_id 61 | self.stop_ids = [ 62 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 63 | for stop_id in stopping_sequences 64 | ] 65 | self._stopped_batches = set() 66 | self._nulled_batch = None 67 | 68 | def __call__( 69 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor 70 | ) -> torch.FloatTensor: 71 | for batch_index, batch in enumerate(input_ids): 72 | if batch_index not in self._stopped_batches: 73 | for stop_id in self.stop_ids: 74 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal( 75 | stop_id.to(batch.device) 76 | ): 77 | self._stopped_batches.add(batch_index) 78 | break 79 | if batch_index in self._stopped_batches: 80 | if self._nulled_batch is None: 81 | scores[batch_index, :] = -float("inf") 82 | scores[batch_index, self.eos_token_id] = 0 83 | self._nulled_batch = scores[batch_index] 84 | scores[batch_index] = self._nulled_batch 85 | return scores 86 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/llamacpp.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union 3 | 4 | import torch 5 | from huggingface_hub import hf_hub_download 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | from ._base import LLMInitializer 10 | 11 | if TYPE_CHECKING: 12 | from llama_cpp import Llama 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class LlamaCppTokenizer: 18 | """Thin wrapper around a llama_cpp model to provide a subset of the PreTrainedTokenizer interface""" 19 | 20 | def __init__(self, model: "Llama") -> None: 21 | self.model = model 22 | 23 | def decode(self, tokens: Union[List[int], List[List[int]]], **kwargs) -> str: 24 | if not tokens: 25 | return tokens 26 | if isinstance(tokens[0], int): 27 | return self.model.detokenize(tokens).decode("utf-8") 28 | return [self.decode(t) for t in tokens] 29 | 30 | def encode(self, text: Union[str, List[str], List[List[str]]], **kwargs) -> str: 31 | if isinstance(text, str): 32 | return self.model.tokenize(text.encode("utf-8")) 33 | return [self.encode(t) for t in text] 34 | 35 | def batch_encode(self, text: Union[List[str], List[List[str]]], **kwargs) -> str: 36 | return self.encode(text) 37 | 38 | def __call__(self, text: Union[str, List[str], List[List[str]]], **kwargs): 39 | return self.encode(text, **kwargs) 40 | 41 | 42 | class LlamaCppInitializer(LLMInitializer): 43 | """Initialize llama_cpp model and tokenizer. 44 | 45 | Args: 46 | device (torch.device): Device to place model and tokenizer on. 47 | world_size (int): Number of GPUs to use. 48 | model_filename (str): Name of the model file to download from HuggingFace Hub. 49 | This needs to be in the ``model_id`` repository (passed to ``self.load()``). 50 | **model_init_kwargs: Keyword arguments to pass to the llama_cpp model init. 
51 | """ 52 | 53 | def __init__( 54 | self, 55 | device: torch.device, 56 | world_size: int, 57 | model_filename: str, 58 | **model_init_kwargs, 59 | ): 60 | super().__init__( 61 | device=device, 62 | world_size=world_size, 63 | ) 64 | self.model_filename = model_filename 65 | self.model_init_kwargs = model_init_kwargs 66 | 67 | def _get_model_init_kwargs(self) -> Dict[str, Any]: 68 | return { 69 | # We use a large integer to put all of the layers on GPU by default. 70 | "n_gpu_layers": 0 if self.device.type == "cpu" else 10**6, 71 | "seed": 0, 72 | "verbose": False, 73 | "n_threads": int(os.environ["OMP_NUM_THREADS"]), 74 | **self.model_init_kwargs, 75 | } 76 | 77 | def load_model(self, model_id: str) -> "Llama": 78 | logger.info(f"LlamaCppInitializer downloading {model_id} : {self.model_filename}") 79 | model_path = hf_hub_download(model_id, self.model_filename) 80 | logger.info(f"LlamaCppInitializer Loading model {model_path}") 81 | # Lazy import to avoid issues on CPU head node 82 | from llama_cpp import Llama 83 | 84 | return Llama( 85 | model_path=os.path.abspath(model_path), 86 | **self._get_model_init_kwargs(), 87 | ) 88 | 89 | def load_tokenizer(self, tokenizer_name: str) -> None: 90 | return None 91 | 92 | def postprocess( 93 | self, model: "Llama", tokenizer: None 94 | ) -> Tuple["Llama", LlamaCppTokenizer]: 95 | return super().postprocess(model, LlamaCppTokenizer(model)) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/_base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import abstractmethod 3 | from typing import Any 4 | from llmadmin.backend.server.models import DataConfig 5 | from datasets import load_dataset 6 | from datasets import load_metric 7 | import transformers 8 | from transformers import PreTrainedTokenizer, PreTrainedModel 9 | from typing import Any, Dict 10 | from llmadmin.backend.server.models import FTConfig 11 | from llmadmin.backend.logger import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | class Task(abc.ABC): 16 | AUTO_MODEL_CLASS: transformers.AutoModel = None 17 | 18 | # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub 19 | # or a path to a custom `datasets` loading script. 20 | DATASET_PATH: str = None 21 | 22 | # The name of a subset within `DATASET_PATH`. 
23 | DATASET_NAME: str = None 24 | 25 | # kwargs when build model with transformer's "from_pretrained" 26 | FROM_PRETRAINED_KWARGS: Dict[str, Any] = None 27 | 28 | def __init__( 29 | self, 30 | tokenizer: "PreTrainedTokenizer", 31 | ft_config: "FTConfig", 32 | ) -> None: 33 | self.tokenizer = tokenizer 34 | self.ft_config = ft_config 35 | self.download_dataset() 36 | self._pre() 37 | 38 | @classmethod 39 | def from_tokenizer( 40 | cls, 41 | tokenizer: "PreTrainedTokenizer", 42 | ft_config: "FTConfig", 43 | ) -> "Task": 44 | fac = cls( 45 | tokenizer = tokenizer, 46 | ft_config = ft_config 47 | ) 48 | 49 | return fac 50 | 51 | @abstractmethod 52 | def get_data_proprocess(self) -> Any: 53 | """Change trainning data to tensor model can accepted""" 54 | pass 55 | 56 | @abstractmethod 57 | def get_compute_metrics(self) -> Any: 58 | pass 59 | 60 | @abstractmethod 61 | def get_data_collator(self) -> Any: 62 | pass 63 | 64 | def _pre(self) -> Any: 65 | pass 66 | 67 | @abstractmethod 68 | def training_key(self): 69 | """ 70 | :return: Iterable[obj] 71 | A iterable of any object, that doc_to_text can handle 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def validation_key(self): 77 | """ 78 | :return: Iterable[obj] 79 | A iterable of any object, that doc_to_text can handle 80 | """ 81 | pass 82 | 83 | @abstractmethod 84 | def getTrainDataSet(self): 85 | pass 86 | 87 | @abstractmethod 88 | def getEvalDataSet(self): 89 | pass 90 | 91 | @abstractmethod 92 | def getSmallTrainDataSet(self, len: int): 93 | pass 94 | 95 | @abstractmethod 96 | def getSmallEvalDataSet(self, len: int): 97 | pass 98 | 99 | def get_dataset(self): 100 | return self.dataset 101 | 102 | def download_dataset(self): 103 | # Downloading and loading a dataset from the hub. 104 | logger.info("Start loading dataset") 105 | if self.ft_config.data_config.local_path: 106 | logger.info(f"Loading dataset from local path {self.ft_config.data_config.local_path}") 107 | raw_datasets = load_dataset(self.ft_config.data_config.local_path) 108 | else: 109 | if self.DATASET_NAME: 110 | logger.info(f"Downloading dataset {self.DATASET_NAME} from {self.DATASET_PATH}") 111 | raw_datasets = load_dataset(self.DATASET_PATH, self.DATASET_NAME) 112 | else: 113 | logger.info(f"Downloading dataset from {self.DATASET_PATH}") 114 | raw_datasets = load_dataset(self.DATASET_PATH) 115 | logger.info("Done load dataset") 116 | logger.info(f"{raw_datasets}") 117 | self.dataset = raw_datasets 118 | 119 | def set_model(self, model: PreTrainedModel): 120 | self.model = model 121 | 122 | def get_model(self): 123 | return self.model -------------------------------------------------------------------------------- /llm_finetune_ray.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import ray 5 | import ray.util.scheduling_strategies 6 | 7 | 8 | def force_on_node(node_id: str, remote_func_or_actor_class): 9 | scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( 10 | node_id=node_id, soft=False 11 | ) 12 | options = {"scheduling_strategy": scheduling_strategy} 13 | return remote_func_or_actor_class.options(**options) 14 | 15 | 16 | def run_on_every_node(remote_func_or_actor_class, *remote_args, **remote_kwargs): 17 | refs = [] 18 | for node in ray.nodes(): 19 | if node["Alive"] and node["Resources"].get("GPU", None): 20 | refs.append( 21 | force_on_node(node["NodeID"], remote_func_or_actor_class).remote( 22 | *remote_args, **remote_kwargs 23 | ) 24 
| ) 25 | return ray.get(refs) 26 | 27 | 28 | @ray.remote(num_gpus=1) 29 | def mount_nvme(): 30 | if os.path.exists("/nvme"): 31 | return 32 | subprocess.run( 33 | 'drive_name="${1:-/dev/nvme1n1}"; mount_path="${2:-/nvme}"; set -x; sudo file -s "$drive_name"; sudo apt install xfsprogs -y; sudo mkfs -t xfs "$drive_name"; sudo mkdir "$mount_path" && sudo mount "$drive_name" "$mount_path" && sudo chown -R ray "$mount_path"', 34 | shell=True, 35 | check=True, 36 | ) 37 | 38 | 39 | @ray.remote(num_gpus=1) 40 | def download_model(base_model_name=None): 41 | base_model_name = ( 42 | base_model_name or "RWKV-4-Pile-1B5" 43 | ) # "RWKV-4-Pile-1B5", "RWKV-4-Pile-430M", "RWKV-4-Pile-169M" 44 | base_model_url = f"https://huggingface.co/BlinkDL/{base_model_name.lower()}" 45 | subprocess.run( 46 | f"cd /nvme; git lfs clone {base_model_url}; ls '{base_model_name.lower()}'", 47 | shell=True, 48 | check=True, 49 | ) 50 | 51 | 52 | @ray.remote(num_gpus=1) 53 | def download_pile_remote(dataset_name): 54 | subprocess.run( 55 | "rm -rf /nvme/enwik8; rm -rf /nvme/data/pile/; rm -rf ~/gpt-neox", 56 | shell=True, 57 | check=True, 58 | ) 59 | subprocess.run( 60 | "cd ~/; git clone https://github.com/Yard1/gpt-neox.git;", shell=True 61 | ) 62 | subprocess.run( 63 | f"cd ~/; cd gpt-neox; echo 'starting dataset download {dataset_name}'; python prepare_data.py {dataset_name} -d /nvme/data/pile -t HFTokenizer --vocab-file '/mnt/cluster_storage/20B_tokenizer.json' && echo 'download complete'", 64 | shell=True, 65 | check=True, 66 | ) 67 | 68 | 69 | def download_pile(dataset_name): 70 | subprocess.run( 71 | # Necessary for gpt-neox tokenizer to work 72 | "pip uninstall -y deepspeed && pip install --user -U git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed", 73 | shell=True, 74 | ) 75 | try: 76 | run_on_every_node(download_pile_remote, dataset_name=dataset_name) 77 | finally: 78 | subprocess.run( 79 | # Use latest deepspeed for actual training. 
Will crash otherwise 80 | "pip uninstall -y deepspeed && pip install -U --user deepspeed", 81 | shell=True, 82 | ) 83 | 84 | 85 | @ray.remote(num_gpus=1) 86 | def clean_cache(): 87 | subprocess.run("rm -rf ~/.cache/torch_extensions", shell=True, check=True) 88 | 89 | 90 | @ray.remote(num_gpus=1) 91 | def run(cmd: str): 92 | subprocess.run(cmd, shell=True, check=True) 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | 98 | parser.add_argument("function", type=str, help="function in this file to run") 99 | parser.add_argument("args", nargs="*", type=str, help="string args to function") 100 | args = parser.parse_args() 101 | 102 | ray.init() 103 | if args.function not in globals(): 104 | raise ValueError(f"{args.function} doesn't exist") 105 | fn = globals()[args.function] 106 | assert callable(fn) or hasattr(fn, "_function") 107 | print(f"Running {args.function}({', '.join(args.args)})") 108 | if hasattr(fn, "_function"): 109 | run_on_every_node(fn, *args.args) 110 | else: 111 | fn(*args.args) 112 | -------------------------------------------------------------------------------- /llmadmin/backend/server/run.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | from typing import Dict, List, Union 3 | import ray 4 | from llmadmin.backend.server.app import ApiServer 5 | from llmadmin.backend.server.config import SERVE_RUN_HOST 6 | from llmadmin.backend.server.models import FTApp 7 | from llmadmin.backend.server.utils import parse_args, parse_args_ft 8 | # import uuid 9 | # import os 10 | from llmadmin.backend.llm.ft import TransformersFT 11 | from llmadmin.backend.llm.ft import RayTrain 12 | from llmadmin.backend.logger import get_logger 13 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 14 | from llmadmin.backend.server.utils import get_serve_port 15 | from ray import serve 16 | 17 | # ray.init(address="auto") 18 | logger = get_logger(__name__) 19 | 20 | def run_ray_ft(ft: Union[FTApp, str]): 21 | """Run LLM fine-tuning with Ray Train on the local Ray cluster. 22 | 23 | Args: 24 | ft: An FTApp object or a path to a YAML file defining one. 25 | 26 | Example: 27 | run_ray_ft("models/model.yaml") # run from a fine-tune YAML definition 28 | run_ray_ft(ft_app) # run from an FTApp object 29 | """ 30 | 31 | ft = parse_args_ft(ft) 32 | if not ft: 33 | raise RuntimeError("No valid fine-tune definition was found.") 34 | 35 | if isinstance(ft, FTApp): 36 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}") 37 | else: 38 | raise RuntimeError("The parsed definition is not a valid FTApp.") 39 | 40 | # ray._private.usage.usage_lib.record_library_usage("llmadmin") 41 | 42 | runner = RayTrain(ft) 43 | runner.train() 44 | 45 | def run_ft(ft: Union[FTApp, str]): 46 | """Run LLM fine-tuning with Transformers on the local Ray cluster. 47 | 48 | Args: 49 | ft: An FTApp object or a path to a YAML file defining one. 50 | 51 | Example: 52 | run_ft("models/model.yaml") # run from a fine-tune YAML definition 53 | run_ft(ft_app) # run from an FTApp object 54 | """ 55 | 56 | ft = parse_args_ft(ft) 57 | if not ft: 58 | raise RuntimeError("No valid fine-tune definition was found.") 59 | 60 | if isinstance(ft, FTApp): 61 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}") 62 | else: 63 | raise RuntimeError("The parsed definition is not a valid FTApp.") 64 | 65 | ray._private.usage.usage_lib.record_library_usage("llmadmin") 66 | 67 | runner = TransformersFT(ft) 68 | runner.train() 69 | 70 | def start_apiserver(port: int = DEFAULT_HTTP_PORT,
resource_config: str = None, scale_config: str = None): 71 | """Run the API Server on the local Ray Cluster 72 | 73 | Args: 74 | *host: The host ip to run. 75 | *port: The port to run. 76 | 77 | """ 78 | scale_dict = dict() 79 | try: 80 | scale_dict = toDict(scale_config) 81 | except: 82 | raise ValueError(f"Invalid value of scale config '{scale_config}'") 83 | resource_dict = None 84 | try: 85 | resource_dict = toDict(resource_config) 86 | except: 87 | raise ValueError(f"Invalid value of resource config '{resource_config}'") 88 | 89 | # ray._private.usage.usage_lib.record_library_usage("llmfinetune") 90 | # ray.init(address="auto") 91 | serve_start_port = get_serve_start_port(port) 92 | app = ApiServer.options(autoscaling_config=scale_dict, ray_actor_options=resource_dict).bind() 93 | serve.start(http_options={"host": SERVE_RUN_HOST, "port": serve_start_port}) 94 | logger.info(f"Serve 'apiserver' is running at {SERVE_RUN_HOST}/{serve_start_port}") 95 | logger.info(f"Serve 'apiserver' run with resource: {resource_dict} , scale: {scale_dict}") 96 | serve.run(app, name="apiserver", route_prefix="/api") 97 | 98 | # parse k1=v1,k2=v2 to dict 99 | def toDict(kv: str) -> Dict: 100 | if kv: 101 | s = kv.replace(' ', ', ') 102 | return eval(f"dict({s})") 103 | else: 104 | return dict() 105 | 106 | def get_serve_start_port(port: int): 107 | serve_start_port = port 108 | serve_runtime_port = get_serve_port() 109 | if serve_runtime_port > -1: 110 | logger.info( 111 | f"Serve is already running at {SERVE_RUN_HOST}:{serve_runtime_port}") 112 | serve_start_port = serve_runtime_port 113 | return serve_start_port 114 | 115 | # if __name__ == "__main__": 116 | # run_ft(*sys.argv[1:]) 117 | 118 | 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The build output should clearly not be checked in 2 | .llm-ray/ 3 | *test-output.xml 4 | /bazel-* 5 | /python/ray/core 6 | /python/ray/pickle5_files/ 7 | /python/ray/thirdparty_files/ 8 | /python/ray/pyarrow_files/ 9 | /python/ray/jars/ 10 | /python/ray/cpp/ 11 | /python/build 12 | /python/dist 13 | /python/python-driver-* 14 | /python/ray/serve/generated 15 | /thirdparty/pkg/ 16 | /build/java 17 | .jar 18 | /dashboard/client/build 19 | finetune_models 20 | 21 | # Files generated by flatc should be ignored 22 | /src/ray/gcs/format/*_generated.h 23 | /src/ray/object_manager/format/*_generated.h 24 | /src/ray/raylet/format/*_generated.h 25 | /java/runtime/src/main/java/io/ray/runtime/generated/* 26 | /java/serve/src/main/java/io/ray/serve/generated/* 27 | 28 | # Files genrated by c++ worker should be ignored. 
29 | /cpp/example/thirdparty/ 30 | /cpp/example/bazel-* 31 | /python/ray/cpp 32 | 33 | # Redis temporary files 34 | *dump.rdb 35 | 36 | # Python byte code files 37 | *.pyc 38 | python/.eggs 39 | 40 | # Backup files 41 | *.bak 42 | 43 | # Emacs temporary files 44 | *~ 45 | *# 46 | 47 | # Compiled Object files 48 | *.slo 49 | *.lo 50 | *.o 51 | *.xo 52 | *.obj 53 | 54 | # Precompiled Headers 55 | *.gch 56 | *.pch 57 | 58 | # Compiled Dynamic libraries 59 | *.so 60 | *.dylib 61 | *.dll 62 | python/ray/_raylet.pyd 63 | 64 | # Incremental linking files 65 | *.ilk 66 | 67 | # Library export files 68 | *.exp 69 | 70 | # Debug symbols 71 | *.pdb 72 | 73 | # Fortran module files 74 | *.mod 75 | !deploy/ray-operator/go.mod 76 | 77 | # Compiled Static libraries 78 | *.lai 79 | *.la 80 | *.a 81 | *.lib 82 | 83 | # Executables 84 | *.exe 85 | *.out 86 | *.app 87 | 88 | # Visual Studio files 89 | /packages 90 | *.suo 91 | *.user 92 | *.VC.db 93 | *.VC.opendb 94 | 95 | # Protobuf-generated files 96 | *_pb2.py 97 | *.pb.h 98 | *.pb.cc 99 | 100 | # Ray cluster configuration 101 | scripts/nodes.txt 102 | 103 | # OS X folder attributes 104 | .DS_Store 105 | 106 | # Debug files 107 | *.dSYM/ 108 | *.su 109 | 110 | # Python setup files 111 | *.egg-info 112 | 113 | # Compressed files 114 | *.gz 115 | 116 | # Datasets from examples 117 | **/MNIST_data/ 118 | **/cifar-10-batches-bin/ 119 | 120 | # Generated documentation files 121 | /doc/_build 122 | /doc/source/_static/thumbs 123 | /doc/source/tune/generated_guides/ 124 | /doc/source/**/doc/ 125 | 126 | # User-specific stuff: 127 | .idea/**/workspace.xml 128 | .idea/**/tasks.xml 129 | .idea/dictionaries 130 | .llvm-local.bazelrc 131 | 132 | # Sensitive or high-churn files: 133 | .idea/**/dataSources/ 134 | .idea/**/dataSources.ids 135 | .idea/**/dataSources.xml 136 | .idea/**/dataSources.local.xml 137 | .idea/**/sqlDataSources.xml 138 | .idea/**/dynamic.xml 139 | .idea/**/uiDesigner.xml 140 | 141 | # Gradle: 142 | .idea/**/gradle.xml 143 | .idea/**/libraries 144 | .idea 145 | 146 | # Website 147 | /site/Gemfile.lock 148 | /site/.sass-cache 149 | /site/_site 150 | 151 | # Pytest Cache 152 | **/.pytest_cache 153 | **/.cache 154 | .benchmarks 155 | python-driver-* 156 | 157 | # Vscode 158 | .vscode/ 159 | 160 | *.iml 161 | 162 | # Java 163 | java/**/target 164 | java/**/lib 165 | java/**/.settings 166 | java/**/.classpath 167 | java/**/.project 168 | java/runtime/native_dependencies/ 169 | java/testng_custom.xml 170 | 171 | dependency-reduced-pom.xml 172 | 173 | # Cpp 174 | cpp/example/thirdparty/ 175 | 176 | .clwb 177 | 178 | # pom.xml files generated from pom_template.xml 179 | java/**/pom.xml 180 | 181 | # python virtual env 182 | venv 183 | 184 | # pyenv version file 185 | .python-version 186 | 187 | # Vim 188 | .*.swp 189 | *.swp 190 | .*.swo 191 | *.swo 192 | tags 193 | tags.lock 194 | tags.temp 195 | *.vim 196 | 197 | # Emacs 198 | .#* 199 | 200 | # tools 201 | tools/prometheus* 202 | 203 | # ray project files 204 | project-id 205 | .mypy_cache/ 206 | 207 | # release test related 208 | .anyscale.yaml 209 | test_state.json 210 | 211 | # workflow storage 212 | workflow_data/ 213 | 214 | # vscode java extention generated 215 | .factorypath 216 | 217 | # Jupyter Notebooks 218 | **/.ipynb_checkpoints/ 219 | 220 | /external 221 | # Compiled output -> don't check in 222 | /compile_commands.json 223 | # Directory where clangd puts its indexing work 224 | /.cache/ 225 | 226 | # Auto-generated tag mapping 227 | tag-mapping.json 228 | 229 | .bazeliskrc 230 | 231 | # 
ignore tmp files 232 | *.tmp 233 | deploy/anyscale/service.yaml 234 | out 235 | 236 | # build output 237 | build/ 238 | dist/ 239 | 240 | results/ 241 | aviary-output.json 242 | evaluation-output.json 243 | prompts.txt 244 | hash 245 | __pycache__ 246 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Tuple 2 | 3 | import torch 4 | from transformers import PreTrainedTokenizer 5 | 6 | from llmadmin.backend.server.models import Prompt 7 | 8 | 9 | def tokenize_string(tokenizer: PreTrainedTokenizer, key: str) -> Union[int, List[int]]: 10 | """Tokenize a string using a tokenizer. 11 | 12 | Args: 13 | tokenizer (PreTrainedTokenizer): Tokenizer to use. 14 | key (str): String to tokenize. 15 | """ 16 | token_ids = tokenizer.encode(key, add_special_tokens=False) 17 | return token_ids[0] if len(token_ids) == 1 else token_ids 18 | 19 | 20 | def decode_tokens(tokenizer: PreTrainedTokenizer, tokens: Union[int, List[int]]) -> str: 21 | tokens = tokens if isinstance(tokens, list) else [tokens] 22 | text = tokenizer.decode(tokens) 23 | return text 24 | 25 | 26 | def truncate_to_first_stop_token( 27 | tokens: torch.LongTensor, 28 | stop_ids: List[Union[int, List[int]]], 29 | ) -> torch.LongTensor: 30 | """Truncate tokens up to the first stop_id. 31 | 32 | Args: 33 | tokens (torch.LongTensor): Tokens to truncate. 34 | stop_ids (List[Union[int, List[int]]]): Stop ids to truncate at. Can be 35 | composed of single stop ids or sequences of ids. 36 | """ 37 | if not stop_ids: 38 | return tokens 39 | stop_ids: List[torch.LongTensor] = [ 40 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 41 | for stop_id in stop_ids 42 | ] 43 | for i in range(len(tokens)): 44 | for stop_id_index, _ in enumerate(stop_ids): 45 | stop_id = stop_ids[stop_id_index].to(tokens.device) 46 | if len(tokens) - i >= len(stop_id) and tokens[i : len(stop_id) + i].equal( 47 | stop_id 48 | ): 49 | return tokens[:i] 50 | return tokens 51 | 52 | 53 | 54 | def _construct_prompt(prompt: Union[str, Prompt], prompt_format: str) -> str: 55 | if isinstance(prompt, Prompt): 56 | if prompt.use_prompt_format and prompt_format: 57 | return prompt_format.format(instruction=prompt.prompt) 58 | else: 59 | return prompt.prompt 60 | return prompt_format.format(instruction=prompt) if prompt_format else prompt 61 | 62 | def construct_prompts( 63 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]], 64 | prompt_format: str, 65 | ) -> List[str]: 66 | """Construct prompts from a prompt string or list of prompts.""" 67 | if not isinstance(prompts, list): 68 | prompts = [prompts] 69 | return [_construct_prompt(prompt, prompt_format) for prompt in prompts] 70 | 71 | def construct_prompts_experimental( 72 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]], 73 | prompt_format: str, 74 | ) -> List[str]: 75 | """Construct prompts from a prompt string or list of prompts.""" 76 | if not isinstance(prompts, list): 77 | prompts = [prompts] 78 | 79 | params = [] 80 | for prompt in prompts: 81 | if isinstance(prompt, Prompt) and isinstance(prompt.prompt, Tuple): 82 | params += [_construct_prompt(prompt, prompt_format) for prompt in prompt.prompt] 83 | else: 84 | params.append(_construct_prompt(prompt, prompt_format)) 85 | return params 86 | 87 | 88 | def tokenize_stopping_sequences_where_needed( 89 | tokenizer: PreTrainedTokenizer, 90 
| stopping_sequences: List[Union[str, int, List[int]]], 91 | ) -> List[Union[List[int], int]]: 92 | """If any sequence is a string, tokenize it. 93 | 94 | Args: 95 | tokenizer (PreTrainedTokenizer): Tokenizer to use. 96 | stopping_sequences (List[Union[str, int, List[int]]]): Stopping sequences to 97 | tokenize. Can be ids, sequences of ids or strings. 98 | """ 99 | if not stopping_sequences: 100 | return None 101 | return [ 102 | tokenize_string(tokenizer, sequence) if isinstance(sequence, str) else sequence 103 | for sequence in stopping_sequences 104 | ] 105 | 106 | 107 | def decode_stopping_sequences_where_needed( 108 | tokenizer: PreTrainedTokenizer, 109 | stopping_sequences: List[Union[str, int, List[int]]], 110 | ) -> List[str]: 111 | """If any sequence is a string, tokenize it.""" 112 | if not stopping_sequences: 113 | return None 114 | return [ 115 | decode_tokens(tokenizer, sequence) 116 | if not isinstance(sequence, str) 117 | else sequence 118 | for sequence in stopping_sequences 119 | ] 120 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/maskedlm_imdb.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForMaskedLM 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | 9 | class MaskedLMImdb(Task): 10 | AUTO_MODEL_CLASS = AutoModelForMaskedLM 11 | 12 | DATASET_PATH = "imdb" 13 | 14 | def get_data_proprocess(self) -> Any: 15 | tokenizer = self.tokenizer 16 | 17 | def group_texts(examples): 18 | # Concatenate all texts 19 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 20 | # Compute length of concatenated texts 21 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 22 | # We drop the last chunk if it's smaller than chunk_size 23 | total_length = (total_length // chunk_size) * chunk_size 24 | # Split by chunks of max_len 25 | result = { 26 | k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 27 | for k, t in concatenated_examples.items() 28 | } 29 | # Create a new labels column 30 | result["labels"] = result["input_ids"].copy() 31 | return result 32 | 33 | 34 | chunk_size = 128 35 | # adopt python decorator TODO 36 | def preprocess_function(examples: pd.DataFrame): 37 | # examples = examples.to_dict("list") 38 | result = tokenizer(examples["text"]) 39 | if tokenizer.is_fast: 40 | result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))] 41 | 42 | tokenized_inputs = group_texts(result) 43 | 44 | # Add back the original columns 45 | ret = {**tokenized_inputs} 46 | return pd.DataFrame.from_dict(ret) 47 | 48 | return preprocess_function 49 | 50 | def get_data_collator(self) -> Any: 51 | import collections 52 | import numpy as np 53 | from transformers import default_data_collator 54 | 55 | wwm_probability = 0.2 56 | tokenizer = self.tokenizer 57 | def whole_word_masking_data_collator(features): 58 | for feature in features: 59 | word_ids = feature.pop("word_ids") 60 | 61 | # Create a map between words and corresponding token indices 62 | mapping = collections.defaultdict(list) 63 | current_word_index = -1 64 | current_word = None 65 | for idx, word_id in enumerate(word_ids): 66 | if word_id is not None: 67 | if word_id != current_word: 68 | current_word = word_id 69 | current_word_index += 1 70 | mapping[current_word_index].append(idx) 71 | 72 | # Randomly mask words 73 | mask = np.random.binomial(1, 
wwm_probability, (len(mapping),)) 74 | input_ids = feature["input_ids"] 75 | labels = feature["labels"] 76 | new_labels = [-100] * len(labels) 77 | for word_id in np.where(mask)[0]: 78 | word_id = word_id.item() 79 | for idx in mapping[word_id]: 80 | new_labels[idx] = labels[idx] 81 | input_ids[idx] = tokenizer.mask_token_id 82 | feature["labels"] = new_labels 83 | 84 | return default_data_collator(features) 85 | 86 | return whole_word_masking_data_collator 87 | 88 | def get_compute_metrics(self) -> Any: 89 | return None 90 | 91 | def training_key(self): 92 | """ 93 | :return: Iterable[obj] 94 | A iterable of any object, that doc_to_text can handle 95 | """ 96 | return "train" 97 | 98 | def validation_key(self): 99 | """ 100 | :return: Iterable[obj] 101 | A iterable of any object, that doc_to_text can handle 102 | """ 103 | return "test" 104 | 105 | def getTrainDataSet(self): 106 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 107 | 108 | def getEvalDataSet(self): 109 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 110 | 111 | def getSmallTrainDataSet(self, len: int): 112 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 113 | 114 | def getSmallEvalDataSet(self, len: int): 115 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) -------------------------------------------------------------------------------- /llmadmin/frontend/leaderboard.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pymongo import DESCENDING, MongoClient 3 | 4 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME, G5_COST_PER_S_IN_DOLLARS 5 | 6 | 7 | class Leaderboard: 8 | def __init__(self, url: str, project_name: str): 9 | self.url = url 10 | self.client = MongoClient(url) 11 | self.db = self.client[DB_NAME] 12 | self.coll = self.db[COLLECTION_NAME] 13 | self.project_name = project_name 14 | 15 | def generate_votes_leaderboard(self) -> pd.DataFrame: 16 | pipeline_votes = [ 17 | {"$match": {"votes": {"$ne": None}}}, 18 | { 19 | "$group": { 20 | "_id": {"llm": "$votes.llm"}, 21 | "Votes": {"$sum": "$votes.score"}, 22 | } 23 | }, 24 | {"$sort": {"count": DESCENDING}}, 25 | { 26 | "$project": { 27 | "LLM": "$_id.llm", 28 | "_id": 0, 29 | "Votes": 1, 30 | } 31 | }, 32 | ] 33 | 34 | pipeline_contentions = [ 35 | {"$match": {"votes": {"$ne": None}}}, 36 | {"$unwind": {"path": "$responses"}}, 37 | { 38 | "$group": { 39 | "_id": {"llm": "$responses.model_id"}, 40 | "In Contention": {"$sum": 1.0}, 41 | } 42 | }, 43 | { 44 | "$project": { 45 | "LLM": "$_id.llm", 46 | "_id": 0, 47 | "In Contention": 1, 48 | } 49 | }, 50 | ] 51 | 52 | df_contentions = pd.DataFrame( 53 | list(self.coll.aggregate(pipeline_contentions)), 54 | columns=["LLM", "In Contention"], 55 | ) 56 | df_votes = pd.DataFrame( 57 | list(self.coll.aggregate(pipeline_votes)), columns=["LLM", "Votes"] 58 | ) 59 | df = pd.merge(df_votes, df_contentions, on="LLM", how="right").fillna(0) 60 | # Use m-estimate correction with prior of 1/3 61 | df["Win Ratio"] = (df["Votes"] + 1) / (df["In Contention"] + 3) * 3 * 1000 62 | df["Win Ratio"] = df["Win Ratio"].astype(int) 63 | df = df.sort_values(by="Win Ratio", ascending=False) 64 | return df 65 | 66 | def generate_perf_leaderboard(self) -> pd.DataFrame: 67 | pipeline = [ 68 | {"$match": {"votes": {"$ne": None}}}, 69 | {"$unwind": {"path": 
"$responses"}}, 70 | {"$match": {"responses": {"$ne": None}}}, 71 | { 72 | "$group": { 73 | "_id": {"llm": "$responses.model_id"}, 74 | "avg_latency": {"$avg": "$responses.gen_stats.total_time"}, 75 | "avg_length": {"$avg": "$responses.gen_stats.num_total_tokens"}, 76 | } 77 | }, 78 | { 79 | "$project": { 80 | "LLM": "$_id.llm", 81 | "_id": 0, 82 | "Lat (s)": "$avg_latency", 83 | "Tokens (i/o)": "$avg_length", 84 | } 85 | }, 86 | ] 87 | 88 | df = pd.DataFrame( 89 | list(self.coll.aggregate(pipeline)), 90 | columns=["LLM", "Lat (s)", "Tokens (i/o)"], 91 | ) 92 | print(f"Raw DF \n{df}") 93 | df["Tokens/s"] = df["Tokens (i/o)"] / df["Lat (s)"] 94 | df["Cost per answer"] = df["Lat (s)"] * G5_COST_PER_S_IN_DOLLARS 95 | df["CP 1k tokens $"] = 1000 / df["Tokens/s"] * G5_COST_PER_S_IN_DOLLARS 96 | df = df.sort_values(by="Tokens/s", ascending=False) 97 | df = df.round( 98 | { 99 | "Lat (s)": 1, 100 | "Tokens (i/o)": 1, 101 | "Tokens/s": 1, 102 | "Cost per answer": 4, 103 | "CP 1k tokens $": 4, 104 | } 105 | ) 106 | print(df) 107 | return df 108 | 109 | 110 | class DummyLeaderboard(Leaderboard): 111 | def __init__(self, url: str = None, project_name: str = None): 112 | pass 113 | 114 | def generate_votes_leaderboard(self) -> pd.DataFrame: 115 | return pd.DataFrame( 116 | columns=["LLM", "In Contention", "Win Ratio"], 117 | ) 118 | 119 | def generate_perf_leaderboard(self) -> pd.DataFrame: 120 | return pd.DataFrame( 121 | columns=[ 122 | "LLM", 123 | "Lat (s)", 124 | "Tokens (i/o)", 125 | "Tokens/s", 126 | "Cost per answer", 127 | "CP 1k tokens $", 128 | ] 129 | ) 130 | -------------------------------------------------------------------------------- /llmadmin/api/sdk.py: -------------------------------------------------------------------------------- 1 | # from typing import Any, Dict, List 2 | from llmadmin.api.env import assert_has_backend 3 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 4 | from llmadmin.backend.server import run 5 | 6 | 7 | # __all__ = ["models", "metadata", "run"] 8 | 9 | def start_apiserver(port: int = DEFAULT_HTTP_PORT, resource_config: str = None, scale_config: str = None) -> None: 10 | """Run Api server on the local ray cluster 11 | 12 | NOTE: This only works if you are running this command 13 | on the Ray or Anyscale cluster directly. It does not 14 | work from a general machine which only has the url and token 15 | for a model. 16 | """ 17 | assert_has_backend() 18 | run.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config) 19 | 20 | def run_ft(ft: str) -> None: 21 | """Run LLMAdmin on the local ray cluster 22 | 23 | NOTE: This only works if you are running this command 24 | on the Ray or Anyscale cluster directly. It does not 25 | work from a general machine which only has the url and token 26 | for a model. 27 | """ 28 | assert_has_backend() 29 | run.run_ft(ft) 30 | 31 | def run_ray_ft(ft: str) -> None: 32 | """Run LLMAdmin on the local ray cluster 33 | 34 | NOTE: This only works if you are running this command 35 | on the Ray or Anyscale cluster directly. It does not 36 | work from a general machine which only has the url and token 37 | for a model. 
38 | """ 39 | assert_has_backend() 40 | run.run_ray_ft(ft) 41 | 42 | # def models() -> List[str]: 43 | # """List available models""" 44 | # from llmadmin.common.backend import get_llmadmin_backend 45 | 46 | # backend = get_llmadmin_backend() 47 | # return backend.models() 48 | 49 | # def _is_llmadmin_model(model: str) -> bool: 50 | # """ 51 | # Determine if this is an llmadmin model. LLMAdmin 52 | # models do not have a '://' in them. 53 | # """ 54 | # return "://" not in model 55 | 56 | # def _supports_batching(model: str) -> bool: 57 | # provider, _ = model.split("://", 1) 58 | # return provider != "openai" 59 | 60 | # def _convert_to_llmadmin_format(model: str, llm_result): 61 | # generation = llm_result.generations 62 | # result_list = [{"generated_text": x.text} for x in generation[0]] 63 | # return result_list 64 | 65 | # def metadata(model_id: str) -> Dict[str, Dict[str, Any]]: 66 | # """Get model metadata""" 67 | # from llmadmin.common.backend import get_llmadmin_backend 68 | 69 | # backend = get_llmadmin_backend() 70 | # return backend.metadata(model_id) 71 | 72 | # def run(*model: str) -> None: 73 | # """Run LLMAdmin on the local ray cluster 74 | 75 | # NOTE: This only works if you are running this command 76 | # on the Ray or Anyscale cluster directly. It does not 77 | # work from a general machine which only has the url and token 78 | # for a model. 79 | # """ 80 | # assert_has_backend() 81 | # from llmadmin.backend.server.run import run 82 | # run(*model) 83 | 84 | # def run_experimental(*model: str) -> None: 85 | # """Run LLMAdmin on the local ray cluster 86 | 87 | # NOTE: This only works if you are running this command 88 | # on the Ray or Anyscale cluster directly. It does not 89 | # work from a general machine which only has the url and token 90 | # for a model. 91 | # """ 92 | # assert_has_backend() 93 | # from llmadmin.backend.server.run import run_experimental 94 | 95 | # run_experimental(*model) 96 | 97 | # def del_experimental(app_name: str) -> None: 98 | # """Delete ray serve on the local ray cluster 99 | 100 | # NOTE: This only works if you are running this command 101 | # on the Ray or Anyscale cluster directly. It does not 102 | # work from a general machine which only has the url and token 103 | # for a model. 104 | # """ 105 | # assert_has_backend() 106 | # from llmadmin.backend.server.run import del_experimental 107 | 108 | # del_experimental(app_name) 109 | 110 | # def run_application(flow: dict) -> None: 111 | # """Run LLMAdmin on the local ray cluster 112 | 113 | # NOTE: This only works if you are running this command 114 | # on the Ray or Anyscale cluster directly. It does not 115 | # work from a general machine which only has the url and token 116 | # for a model. 117 | # """ 118 | # assert_has_backend() 119 | # from llmadmin.backend.server.run import run_application 120 | 121 | # run_application(flow) 122 | 123 | 124 | # def run_comparation() -> None: 125 | # """Run LLMAdmin on the local ray cluster 126 | 127 | # NOTE: This only works if you are running this command 128 | # on the Ray or Anyscale cluster directly. It does not 129 | # work from a general machine which only has the url and token 130 | # for a model. 
131 | # """ 132 | # assert_has_backend() 133 | # from llmadmin.backend.server.run import run_comparation 134 | 135 | # run_comparation() 136 | 137 | 138 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/tokenclassification_conll2003.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForTokenClassification 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | from transformers import DataCollatorForTokenClassification 8 | 9 | class TokenclassificationConll2003(Task): 10 | AUTO_MODEL_CLASS = AutoModelForTokenClassification 11 | 12 | DATASET_PATH = "conll2003" 13 | FROM_PRETRAINED_KWARGS = { 14 | "num_labels": 9 15 | } 16 | 17 | def _pre(self) -> Any: 18 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names 19 | id2label = {i: label for i, label in enumerate(label_names)} 20 | label2id = {v: k for k, v in id2label.items()} 21 | self.FROM_PRETRAINED_KWARGS["id2label"] = id2label 22 | self.FROM_PRETRAINED_KWARGS["label2id"] = label2id 23 | 24 | def get_data_proprocess(self) -> Any: 25 | tokenizer = self.tokenizer 26 | def align_labels_with_tokens(labels, word_ids): 27 | new_labels = [] 28 | current_word = None 29 | for word_id in word_ids: 30 | if word_id != current_word: 31 | # Start of a new word! 32 | current_word = word_id 33 | label = -100 if word_id is None else labels[word_id] 34 | new_labels.append(label) 35 | elif word_id is None: 36 | # Special token 37 | new_labels.append(-100) 38 | else: 39 | # Same word as previous token 40 | label = labels[word_id] 41 | # If the label is B-XXX we change it to I-XXX 42 | if label % 2 == 1: 43 | label += 1 44 | new_labels.append(label) 45 | 46 | return new_labels 47 | 48 | # adopt python decorator TODO 49 | def preprocess_function(examples: pd.DataFrame): 50 | # examples = examples.to_dict("list") 51 | # inputs = [i.tolist() for i in examples["tokens"]] 52 | inputs = [i for i in examples["tokens"]] 53 | tokenized_inputs = tokenizer( 54 | inputs, truncation=True, is_split_into_words=True 55 | ) 56 | all_labels = examples["ner_tags"] 57 | new_labels = [] 58 | for i, labels in enumerate(all_labels): 59 | word_ids = tokenized_inputs.word_ids(i) 60 | new_labels.append(align_labels_with_tokens(labels, word_ids)) 61 | 62 | tokenized_inputs["labels"] = new_labels 63 | 64 | # Add back the original columns 65 | ret = {**examples, **tokenized_inputs} 66 | return pd.DataFrame.from_dict(ret) 67 | 68 | return preprocess_function 69 | 70 | def get_data_collator(self) -> Any: 71 | data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer) 72 | return data_collator 73 | 74 | def get_compute_metrics(self) -> Any: 75 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names 76 | metric = evaluate.load("seqeval") 77 | 78 | def compute_metrics(eval_preds): 79 | logits, labels = eval_preds 80 | predictions = np.argmax(logits, axis=-1) 81 | 82 | # Remove ignored index (special tokens) and convert to labels 83 | true_labels = [[label_names[l] for l in label if l != -100] for label in labels] 84 | true_predictions = [ 85 | [label_names[p] for (p, l) in zip(prediction, label) if l != -100] 86 | for prediction, label in zip(predictions, labels) 87 | ] 88 | all_metrics = metric.compute(predictions=true_predictions, references=true_labels) 89 | return { 90 | "precision": 
all_metrics["overall_precision"], 91 | "recall": all_metrics["overall_recall"], 92 | "f1": all_metrics["overall_f1"], 93 | "accuracy": all_metrics["overall_accuracy"], 94 | } 95 | 96 | return compute_metrics 97 | 98 | def training_key(self): 99 | """ 100 | :return: Iterable[obj] 101 | A iterable of any object, that doc_to_text can handle 102 | """ 103 | return "train" 104 | 105 | def validation_key(self): 106 | """ 107 | :return: Iterable[obj] 108 | A iterable of any object, that doc_to_text can handle 109 | """ 110 | return "validation" 111 | 112 | def getTrainDataSet(self): 113 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 114 | 115 | def getEvalDataSet(self): 116 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 117 | 118 | def getSmallTrainDataSet(self, len: int): 119 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 120 | 121 | def getSmallEvalDataSet(self, len: int): 122 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 123 | -------------------------------------------------------------------------------- /llmadmin/backend/server/config.py: -------------------------------------------------------------------------------- 1 | from llmadmin.backend.server.models import LLMApp 2 | 3 | 4 | LLMTEMPLATE_DEPLOYMENT_CONFIG = { 5 | "autoscaling_config":{ 6 | "min_replicas": 0, 7 | "initial_replicas": 1, 8 | "max_replicas": 8, 9 | "target_num_ongoing_requests_per_replica": 1.0, 10 | "metrics_interval_s": 10.0, 11 | "look_back_period_s": 30.0, 12 | "smoothing_factor": 1.0, 13 | "downscale_delay_s": 300.0, 14 | "upscale_delay_s": 90.0, 15 | }, 16 | "ray_actor_options": { 17 | "num_cpus": 0.1 18 | } 19 | } 20 | LLMTEMPLATE_MODEL_CONFIG_COMPARATION = { 21 | "warmup": True, 22 | "model_task": "text-generation", 23 | "model_id": "template", 24 | "max_input_words": 800, 25 | "initialization": { 26 | "runtime_env": { 27 | "pip": ["deepspeed==0.9.2","accelerate"] 28 | }, 29 | "initializer":{ 30 | "type": "SingleDevice", 31 | "dtype": "float32", 32 | "from_pretrained_kwargs":{ 33 | "use_cache": True , 34 | "trust_remote_code": True 35 | } 36 | 37 | }, 38 | "pipeline": "default" 39 | }, 40 | "generation":{ 41 | "max_batch_size": 18, 42 | "generate_kwargs":{ 43 | "do_sample": True, 44 | "max_new_tokens": 128, 45 | "min_new_tokens": 16, 46 | "temperature": 0.7, 47 | "repetition_penalty": 1.1, 48 | "top_p": 0.8, 49 | "top_k": 50, 50 | }, 51 | "prompt_format": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n", 52 | "stopping_sequences": ["### Response:", "### End"] 53 | } 54 | } 55 | 56 | # TODO defaulttransformers leverage transformer pipeline to load the model, it's a problem, since some model cannot load by pipeline 57 | LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL = { 58 | "warmup": True, 59 | "model_task": "text-generation", 60 | "model_id": "template", 61 | "max_input_words": 800, 62 | "initialization": { 63 | "runtime_env": { 64 | "pip": ["deepspeed==0.9.2","accelerate"] 65 | }, 66 | "initializer":{ 67 | "type": "TransformersPipeline", 68 | "dtype": "float32", 69 | "use_fast": False, 70 | "from_pretrained_kwargs":{ 71 | "use_cache": True , 72 | "trust_remote_code": True 73 | } 74 | 75 | }, 76 | "pipeline": "defaulttransformers" 77 | }, 78 | "generation":{ 79 | "max_batch_size": 18, 80 | "generate_kwargs":{ 81 | "do_sample": True, 82 | "max_new_tokens": 128, 83 | "min_new_tokens": 16, 84 | "temperature": 0.7, 85 | "repetition_penalty": 1.1, 86 | "top_p": 0.8, 87 | "top_k": 50, 88 | }, 89 | "prompt_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n", 90 | "stopping_sequences": ["### Response:", "### End"] 91 | } 92 | } 93 | 94 | 95 | LLMTEMPLATE_SCALE_CONFIG = { 96 | "num_workers":1, 97 | "num_gpus_per_worker":0.0, 98 | "num_cpus_per_worker":1.0, 99 | "placement_strategy":'PACK', 100 | "resources_per_worker":None, 101 | "pg_timeout_s":600 102 | } 103 | EXPERIMENTAL_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL.copy()) 104 | EXPERIMENTAL_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy() 105 | 106 | COMPARATION_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_COMPARATION.copy()) 107 | COMPARATION_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy() 108 | 109 | RAY_AGENT_ADDRESS = "http://localhost:52365" 110 | 111 | MODELS_MAPPING = { 112 | "gpt2": "./models/text-generation--gpt2.yaml", 113 | "t5-small": "./models/translation--t5-small.yaml", 114 | "THUDM/chatglm2-6b": "./models/text-generation--THUDM-chatglm2-6b.yaml", 115 | "THUDM/chatglm-6b": "./models/text-generation--THUDM-chatglm-6b.yaml", 116 | "Qwen/Qwen-7B": "./models/text-generation--Qwen--Qwen-7B.yaml", 117 | "Qwen/Qwen-7B-Chat": "./models/text-generation--Qwen--Qwen-7B-Chat.yaml", 118 | "LinkSoul/Chinese-Llama-2-7b": "./models/text-generation--LinkSoul--Chinese-Llama-2-7b.yaml", 119 | "bigscience/bloom-560m": "./models/text-generation--bigscience--bloom-560m.yaml", 120 | "baichuan-inc/Baichuan-7B": "./models/text-generation--baichuan-inc--Baichuan-7B.yaml", 121 | "distilbert-base-uncased-finetuned-sst-2-english": "./models/text-classification--distilbert-base-uncased-finetuned-sst-2-english.yaml", 122 | "facebook/bart-large-cnn": "./models/summarization--facebook--bart-large-cnn.yaml", 123 | "deepset/roberta-base-squad2": "./models/question-answering--deepset--roberta-base-squad2.yaml", 124 | "nlpconnect/vit-gpt2-image-captioning": "./models/image-to-text--nlpconnect--vit-gpt2-image-captioning.yaml" 125 | } 126 | 127 | URL = "http://127.0.0.1:8000/" 128 | SERVE_RUN_HOST = "0.0.0.0" -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/default_pipeline.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Optional, Union 3 | 4 | import torch 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | 7 | from llmadmin.backend.logger import get_logger 8 | from llmadmin.backend.server.models import Response 9 | 10 | from ._base import BasePipeline 11 | from .processors import StopOnTokens 12 | from .utils import construct_prompts, truncate_to_first_stop_token 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class DefaultPipeline(BasePipeline): 18 | """Default text generation pipeline. 19 | 20 | Args: 21 | model (PreTrainedModel): Hugging Face model. 22 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer. 23 | prompt_format (Optional[str], optional): Prompt format. Defaults to None. 24 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's 25 | device. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | model: PreTrainedModel, 31 | tokenizer: PreTrainedTokenizer, 32 | prompt_format: Optional[str] = None, 33 | device: Optional[Union[str, int, torch.device]] = None, 34 | ) -> None: 35 | super().__init__( 36 | model=model, 37 | tokenizer=tokenizer, 38 | prompt_format=prompt_format, 39 | device=device, 40 | ) 41 | 42 | def preprocess(self, prompts: List[str], **generate_kwargs): 43 | st = time.monotonic() 44 | prompt_text = construct_prompts(prompts, prompt_format=self.prompt_format) 45 | instruction_text = construct_prompts(prompts, prompt_format="") 46 | if self.tokenizer.pad_token is None: 47 | self.tokenizer.pad_token = self.tokenizer.eos_token 48 | 49 | inputs = self.tokenizer( 50 | prompt_text, return_tensors="pt", padding=True, **generate_kwargs 51 | ).to(self.model.device) 52 | if not generate_kwargs.get("return_token_type_ids", True): 53 | inputs.pop("token_type_ids", None) 54 | et = time.monotonic() - st 55 | return { 56 | "inputs": inputs, 57 | "instruction_text": instruction_text, 58 | "prompt_text": prompt_text, 59 | "preprocessing_time": et, 60 | } 61 | 62 | def forward(self, model_inputs, **generate_kwargs): 63 | st = time.monotonic() 64 | inputs = model_inputs["inputs"] 65 | instruction_text = model_inputs["instruction_text"] 66 | prompt_text = model_inputs["prompt_text"] 67 | preprocessing_time = model_inputs["preprocessing_time"] 68 | generated_sequence = self.model.generate( 69 | **{ 70 | **inputs, 71 | **generate_kwargs, 72 | } 73 | ) 74 | et = time.monotonic() - st 75 | return { 76 | "inputs": inputs, 77 | "generated_sequence": generated_sequence, 78 | "instruction_text": instruction_text, 79 | "prompt_text": prompt_text, 80 | "preprocessing_time": preprocessing_time, 81 | "generation_time": et, 82 | "generate_kwargs": generate_kwargs, 83 | } 84 | 85 | def postprocess(self, model_outputs, **postprocess_kwargs) -> List[Response]: 86 | st = time.monotonic() 87 | tokens = model_outputs["generated_sequence"] 88 | input_ids = model_outputs["inputs"]["input_ids"] 89 | token_stopper = next( 90 | ( 91 | x 92 | for x in model_outputs["generate_kwargs"].get("stopping_criteria", []) 93 | if isinstance(x, StopOnTokens) 94 | ), 95 | None, 96 | ) 97 | decoded: List[Response] = [] 98 | num_generated_tokens_batch = 0 99 | num_input_tokens_batch = 0 100 | for token_unwrapped, inputs_unwrapped in zip(tokens, input_ids): 101 | logger.info( 102 | f"Unprocessed generated tokens: '{self.tokenizer.decode(token_unwrapped, skip_special_tokens=False).encode('unicode_escape').decode('utf-8')}'" 103 | ) 104 | 
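# The generated sequence contains the (possibly padded) prompt followed by the
# completion: drop the prompt portion first, then truncate at the first
# stopping sequence (when a StopOnTokens criteria was captured above) before
# decoding the remaining tokens.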
tokens = token_unwrapped[len(inputs_unwrapped) :] 105 | if token_stopper: 106 | tokens = truncate_to_first_stop_token( 107 | tokens, token_stopper.stopping_sequences 108 | ) 109 | text = ( 110 | self.tokenizer.decode(tokens, skip_special_tokens=True) 111 | .replace("\u200b", "") 112 | .strip() 113 | ) 114 | for i in range(len(inputs_unwrapped)): 115 | if inputs_unwrapped[i] != self.tokenizer.pad_token_id: 116 | break 117 | num_input_tokens = len(inputs_unwrapped[i:]) 118 | num_generated_tokens = len(tokens) 119 | response = Response( 120 | generated_text=text, 121 | num_generated_tokens=num_generated_tokens, 122 | num_input_tokens=num_input_tokens, 123 | ) 124 | num_generated_tokens_batch += num_generated_tokens 125 | num_input_tokens_batch += num_input_tokens 126 | decoded.append(response) 127 | et = time.monotonic() - st 128 | for response in decoded: 129 | response.num_generated_tokens_batch = num_generated_tokens_batch 130 | response.num_input_tokens_batch = num_input_tokens_batch 131 | response.preprocessing_time = model_outputs["preprocessing_time"] 132 | response.generation_time = model_outputs["generation_time"] 133 | response.postprocessing_time = et 134 | return decoded 135 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/default_transformers_pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, List, Optional, Union 2 | 3 | import torch 4 | from transformers import Pipeline as TransformersPipeline 5 | from transformers import PreTrainedModel, PreTrainedTokenizer, pipeline 6 | 7 | from llmadmin.backend.logger import get_logger 8 | from llmadmin.backend.server.models import Prompt, Response 9 | 10 | from ._base import BasePipeline 11 | from .utils import construct_prompts, construct_prompts_experimental 12 | # from llmadmin.backend.server.utils import render_gradio_params 13 | from .default_pipeline import DefaultPipeline 14 | 15 | try: 16 | import transformers 17 | from transformers import pipelines 18 | except ImportError as ie: 19 | raise ImportError( 20 | "transformers not installed. Please try `pip install transformers`" 21 | ) from ie 22 | 23 | if TYPE_CHECKING: 24 | from ..initializers._base import LLMInitializer 25 | 26 | logger = get_logger(__name__) 27 | 28 | 29 | class DefaultTransformersPipeline(BasePipeline): 30 | """Text generation pipeline using Transformers Pipeline. 31 | 32 | May not support all features. 33 | 34 | Args: 35 | model (PreTrainedModel): Hugging Face model. 36 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer. 37 | prompt_format (Optional[str], optional): Prompt format. Defaults to None. 38 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's 39 | device. 
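        task (str, optional): Transformers pipeline task name. Accepted for API compatibility but not currently used by this class. Defaults to None.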
40 | """ 41 | 42 | def __init__( 43 | self, 44 | model: PreTrainedModel, 45 | tokenizer: PreTrainedTokenizer, 46 | prompt_format: Optional[str] = None, 47 | device: Optional[Union[str, int, torch.device]] = None, 48 | task: str = None, 49 | ) -> None: 50 | if not hasattr(model, "generate"): 51 | raise ValueError("Model must have a generate method.") 52 | super().__init__(model, tokenizer, prompt_format, device) 53 | 54 | self.pipeline = None 55 | self.preprocess = None 56 | self.postprocess = None 57 | 58 | def _get_transformers_pipeline(self, **kwargs) -> TransformersPipeline: 59 | default_kwargs = dict( 60 | task="text-generation", 61 | model=self.model, 62 | tokenizer=self.tokenizer, 63 | device=None, 64 | ) 65 | transformers_pipe = pipeline(**{**default_kwargs, **kwargs}) 66 | transformers_pipe.device = self.device 67 | return transformers_pipe 68 | 69 | @torch.inference_mode() 70 | def __call__(self, inputs: List[Union[str, Prompt]], **kwargs) -> List[Response]: 71 | if not self.pipeline: 72 | self.pipeline = self._get_transformers_pipeline() 73 | 74 | logger.info(f"input from pipeline: ****** {inputs}") 75 | inputs = construct_prompts_experimental( 76 | inputs, prompt_format=self.prompt_format) 77 | 78 | logger.info(f"input from pipeline: ****** {inputs}") 79 | 80 | if self.preprocess: 81 | data = self.preprocess(inputs) 82 | 83 | logger.info(data) 84 | kwargs.pop("stopping_sequences", None) 85 | kwargs.pop("timeout_s", None) 86 | kwargs.pop("start_timestamp", None) 87 | # special cases that needs to be handled differently 88 | if isinstance( 89 | self.pipeline, 90 | ( 91 | pipelines.text_classification.TextClassificationPipeline, 92 | pipelines.text2text_generation.Text2TextGenerationPipeline, 93 | pipelines.text2text_generation.TranslationPipeline, 94 | ), 95 | ): 96 | data = self.pipeline(*data, **kwargs) 97 | else: 98 | data = self.pipeline(**data, **kwargs) 99 | 100 | logger.info(f"output from pipeline: ****** {data}") 101 | if self.postprocess: 102 | output = self.postprocess(data) 103 | 104 | return output 105 | 106 | @classmethod 107 | def from_initializer( 108 | cls, 109 | initializer: "LLMInitializer", 110 | model_id: str, 111 | prompt_format: Optional[str] = None, 112 | device: Optional[Union[str, int, torch.device]] = None, 113 | stopping_sequences: List[Union[int, str]] = None, 114 | **kwargs, 115 | ) -> "DefaultTransformersPipeline": 116 | model_from_pretrained_kwargs = initializer.get_model_from_pretrained_kwargs() 117 | default_kwargs = dict( 118 | model=model_id, 119 | **kwargs, 120 | **model_from_pretrained_kwargs 121 | ) 122 | 123 | transformers_pipe = pipeline( 124 | **default_kwargs, 125 | model_kwargs=initializer.get_model_init_kwargs(), 126 | ) 127 | # transformers_pipe.model = initializer.postprocess_model(transformers_pipe.model) 128 | pipe = cls( 129 | model=transformers_pipe.model, 130 | tokenizer=transformers_pipe.tokenizer, 131 | prompt_format=prompt_format, 132 | device=device, 133 | # stopping_sequences=stopping_sequences, 134 | **kwargs, 135 | ) 136 | pipe.pipeline = transformers_pipe 137 | transformers_pipe.device = pipe.device 138 | 139 | # if "task" in kwargs: 140 | # pipeline_info = render_gradio_params(kwargs["task"]) 141 | # pipe.preprocess = pipeline_info["preprocess"] 142 | # pipe.postprocess = pipeline_info["postprocess"] 143 | 144 | return pipe 145 | 146 | def preprocess(self, prompts: List[str], **generate_kwargs): 147 | pass 148 | 149 | def forward(self, model_inputs, **generate_kwargs): 150 | pass 151 | 
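# A minimal, illustrative sketch of the raw transformers pipeline that
# _get_transformers_pipeline() assembles; "gpt2" is only a placeholder checkpoint.
if __name__ == "__main__":
    demo_pipe = pipeline(
        task="text-generation",  # same default task as above
        model="gpt2",            # placeholder model id
        tokenizer="gpt2",
        device=None,
    )
    print(demo_pipe("Hello, world", max_new_tokens=16)[0]["generated_text"])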
-------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/noheader_AdvertiseGen.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModel, DataCollatorForSeq2Seq 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | import jieba 7 | from rouge_chinese import Rouge 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | 10 | class NoheaderAdvertiseGen(Task): 11 | AUTO_MODEL_CLASS = AutoModel 12 | 13 | DATASET_PATH = "AdvertiseGen" 14 | 15 | def get_data_proprocess(self) -> Any: 16 | tokenizer = self.tokenizer 17 | max_length = self.ft_config.train_config.base_config.max_length 18 | # adopt python decorator TODO 19 | def preprocess_function(examples: pd.DataFrame): 20 | # examples = examples.to_dict("list") 21 | #-- start 22 | max_source_length = int(max_length / 2) 23 | max_target_length = max_length - max_source_length 24 | # max_seq_length = data_args.max_source_length + data_args.max_target_length 25 | 26 | model_inputs = { 27 | "input_ids": [], 28 | "labels": [], 29 | } 30 | for i in range(len(examples["content"])): 31 | if examples["content"][i] and examples["summary"][i]: 32 | prompt, answer = examples["content"][i], examples["summary"][i] 33 | 34 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) 35 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False) 36 | 37 | if len(a_ids) > max_source_length - 1: 38 | a_ids = a_ids[: max_source_length - 1] 39 | 40 | if len(b_ids) > max_target_length - 2: 41 | b_ids = b_ids[: max_target_length - 2] 42 | 43 | input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) 44 | 45 | context_length = input_ids.index(tokenizer.bos_token_id) 46 | mask_position = context_length - 1 47 | labels = [-100] * context_length + input_ids[mask_position+1:] 48 | 49 | # pad_len = max_length - len(input_ids) 50 | # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len 51 | # labels = labels + [tokenizer.pad_token_id] * pad_len 52 | # if data_args.ignore_pad_token_for_loss: 53 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 54 | 55 | model_inputs["input_ids"].append(input_ids) 56 | model_inputs["labels"].append(labels) 57 | 58 | 59 | # Add back the original columns 60 | ret = {**examples, **model_inputs} 61 | return pd.DataFrame.from_dict(ret) 62 | 63 | return preprocess_function 64 | 65 | def get_compute_metrics(self) -> Any: 66 | tokenizer = self.tokenizer 67 | 68 | def compute_metrics(eval_preds): 69 | preds, labels = eval_preds 70 | if isinstance(preds, tuple): 71 | preds = preds[0] 72 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 73 | 74 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 75 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 76 | 77 | score_dict = { 78 | "rouge-1": [], 79 | "rouge-2": [], 80 | "rouge-l": [], 81 | "bleu-4": [] 82 | } 83 | for pred, label in zip(decoded_preds, decoded_labels): 84 | hypothesis = list(jieba.cut(pred)) 85 | reference = list(jieba.cut(label)) 86 | rouge = Rouge() 87 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference)) 88 | result = scores[0] 89 | 90 | for k, v in result.items(): 91 | score_dict[k].append(round(v["f"] * 100, 4)) 92 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) 93 | 
score_dict["bleu-4"].append(round(bleu_score * 100, 4)) 94 | 95 | for k, v in score_dict.items(): 96 | score_dict[k] = float(np.mean(v)) 97 | return score_dict 98 | 99 | return compute_metrics 100 | 101 | def get_data_collator(self) -> Any: 102 | data_collator = DataCollatorForSeq2Seq( 103 | tokenizer=self.tokenizer, 104 | model=self.model, 105 | label_pad_token_id=-100, 106 | pad_to_multiple_of=None, 107 | padding=True 108 | ) 109 | return data_collator 110 | 111 | def training_key(self): 112 | """ 113 | :return: Iterable[obj] 114 | An iterable of any object that doc_to_text can handle 115 | """ 116 | return "train" 117 | 118 | def validation_key(self): 119 | """ 120 | :return: Iterable[obj] 121 | An iterable of any object that doc_to_text can handle 122 | """ 123 | return "validation" 124 | 125 | def getTrainDataSet(self): 126 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 127 | 128 | def getEvalDataSet(self): 129 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 130 | 131 | def getSmallTrainDataSet(self, len: int): 132 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 133 | 134 | def getSmallEvalDataSet(self, len: int): 135 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LLM - Finetune 3 | 4 | A large-scale model training framework that supports LoRA, full-parameter fine-tuning, and other methods. Define a YAML file specifying the base model, dataset, and training parameters to launch training or fine-tuning of your own models: easy to define, easy to start. Feedback and stars⭐️ are welcome! 5 | 6 | image 7 | 8 | Two steps to run your LLM fine-tuning: 9 | 10 | ## 1. Easy Install 11 | 12 | ### Installation 13 | 14 | Use a `bash` shell for the following commands. 15 | 16 | ```bash 17 | git clone https://github.com/OpenCSGs/llm-finetune.git 18 | cd llm-finetune 19 | pip install . # Install from CN: 'pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple' 20 | ``` 21 | 22 | ## 2. Easy Run 23 | ### Finetune a model by command 24 | 25 | ``` 26 | llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 27 | ``` 28 | 29 | Your fine-tune task is now starting. 30 | 31 | - You can add more YAML files to define your own tasks. 32 | 33 | ******* 34 | ## (Optional) Additional launchers: Launch with accelerate or deepspeed 35 | Ensure accelerate and deepspeed are installed, then follow the steps below. 36 | 37 | ### Launch by accelerate 38 | 39 | Modify the parameters of `accelerate launch` for distributed training. 40 | 41 | #### Finetune on CPU 42 | 43 | ``` 44 | # Use CPU 45 | accelerate launch --cpu --num_machines=1 --num_processes=1 --num_cpu_threads_per_process=1 --mixed_precision=no --dynamo_backend=no llm_finetune.py run-ft --ft=/Users/hub/code/jihulab/opencsg/llm-inference/models/ft--sequenceclassification--bert-base-cased.yaml 46 | ``` 47 | 48 | #### Finetune on GPU on a single host 49 | 50 | Control GPU visibility with `CUDA_VISIBLE_DEVICES`.
51 | 52 | ``` 53 | # Use GPU:0 54 | CUDA_VISIBLE_DEVICES=0 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 55 | 56 | # Use GPU:1 57 | CUDA_VISIBLE_DEVICES=1 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 58 | 59 | # Use GPU:0 60 | accelerate launch --num_machines=1 --num_processes=1 --gpu_ids=0 llmfinetune ... 61 | ``` 62 | 63 | #### Finetune on multiple GPUs on a single host 64 | 65 | ``` 66 | # Use all GPUs with mixed precision disabled 67 | accelerate launch --multi_gpu llmfinetune ... 68 | 69 | # Use all GPUs with mixed precision (fp16) 70 | accelerate launch --multi_gpu --mixed_precision=fp16 llmfinetune ... 71 | 72 | # Use GPU:0 and GPU:1 73 | CUDA_VISIBLE_DEVICES=0,1 accelerate launch --multi_gpu --gpu_ids=0,1 llmfinetune ... 74 | 75 | # Launch with 2 GPUs 76 | accelerate launch --multi_gpu --num_processes 2 llmfinetune ... 77 | ``` 78 | 79 | ``` 80 | # default_config.yaml for a single host with multiple GPUs 81 | compute_environment: LOCAL_MACHINE 82 | deepspeed_config: {} 83 | distributed_type: MULTI_GPU 84 | downcast_bf16: 'no' 85 | dynamo_backend: 'NO' 86 | fsdp_config: {} 87 | gpu_ids: all # use all GPU ids 88 | machine_rank: 0 89 | main_training_function: main 90 | megatron_lm_config: {} 91 | mixed_precision: fp16 # mixed precision 92 | num_machines: 1 # a single machine 93 | num_processes: 4 # 4 GPUs 94 | rdzv_backend: static 95 | same_network: true 96 | use_cpu: false 97 | 98 | 99 | accelerate launch --config_file default_config.yaml llmfinetune ... 100 | ``` 101 | 102 | #### Finetune on multiple GPUs across multiple hosts 103 | 104 | All hosts must have passwordless SSH access to each other. 105 | 106 | ``` 107 | # default_config.yaml 108 | compute_environment: LOCAL_MACHINE 109 | deepspeed_config: 110 | deepspeed_multinode_launcher: standard 111 | gradient_accumulation_steps: 1 112 | gradient_clipping: 1.0 113 | offload_optimizer_device: none 114 | offload_param_device: none 115 | zero3_init_flag: true 116 | zero3_save_16bit_model: true 117 | zero_stage: 3 118 | distributed_type: DEEPSPEED 119 | downcast_bf16: 'no' 120 | dynamo_config: {} 121 | fsdp_config: {} 122 | main_training_function: main 123 | megatron_lm_config: {} 124 | mixed_precision: fp16 125 | num_machines: 2 # 2 nodes 126 | num_processes: 16 # 16 GPUs across all nodes 127 | tpu_env: [] 128 | tpu_use_cluster: false 129 | tpu_use_sudo: false 130 | use_cpu: false 131 | 132 | # Run on every host, specifying `RANK`, `MASTER_ADDR`, and `MASTER_PORT` 133 | accelerate launch --config_file default_config.yaml \ 134 | --machine_rank ${RANK} \ 135 | --main_process_ip ${MASTER_ADDR} \ 136 | --main_process_port ${MASTER_PORT} \ 137 | ... 138 | 139 | # --machine_rank: 0 for the main/master node; 1, 2, 3, ... for the other nodes
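# Illustrative example (the address and port below are placeholders, adjust for your cluster):
#   on node 0: RANK=0 MASTER_ADDR=192.168.1.10 MASTER_PORT=29500 accelerate launch --config_file default_config.yaml --machine_rank ${RANK} --main_process_ip ${MASTER_ADDR} --main_process_port ${MASTER_PORT} llm_finetune.py run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml
#   on node 1: the same command with RANK=1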
140 | ``` 141 | 142 | #### Finetune with DeepSpeed on multiple GPUs across multiple hosts 143 | 144 | All hosts must have passwordless SSH access to each other. 145 | 146 | ``` 147 | # myhostfile 148 | node1 slots=1 149 | node2 slots=1 150 | 151 | # deepspeed.json 152 | { 153 | "train_batch_size": "auto", 154 | "train_micro_batch_size_per_gpu": "auto", 155 | "gradient_accumulation_steps": "auto", 156 | "gradient_clipping": "auto", 157 | "zero_allow_untested_optimizer": true, 158 | "fp16": { 159 | "enabled": "auto", 160 | "loss_scale": 0, 161 | "initial_scale_power": 16, 162 | "loss_scale_window": 1000, 163 | "hysteresis": 2, 164 | "min_loss_scale": 1 165 | }, 166 | "zero_optimization": { 167 | "stage": 2, 168 | "allgather_partitions": true, 169 | "allgather_bucket_size": 5e8, 170 | "reduce_scatter": true, 171 | "reduce_bucket_size": 5e8, 172 | "overlap_comm": false, 173 | "contiguous_gradients": true 174 | } 175 | } 176 | 177 | deepspeed --num_nodes=2 --hostfile=myhostfile --deepspeed deepspeed.json ... 178 | 179 | # --num_nodes: number of hosts 180 | # --hostfile: hostfile listing each host and its number of GPU slots 181 | # --deepspeed: DeepSpeed config file 182 | 183 | ``` 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /llmadmin/common/backend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, List, Union 5 | 6 | import requests 7 | 8 | from llmadmin.common.constants import TIMEOUT 9 | 10 | 11 | class BackendError(RuntimeError): 12 | def __init__(self, *args: object, **kwargs) -> None: 13 | self.response = kwargs.pop("response", None) 14 | super().__init__(*args) 15 | 16 | logger = logging.getLogger("ray.logger") 17 | 18 | def get_llmadmin_backend(url: str = "http://127.0.0.1:8000/cmp_models_default"): 19 | """ 20 | Establishes a connection to the LLMAdmin backend, using information 21 | from environment variables. 22 | If the AVIARY_MOCK environment variable is set, then a mock backend is used. 23 | 24 | For a direct connection to the llmadmin backend (e.g. running on the same cluster), 25 | no AVIARY_TOKEN is required. Otherwise, the AVIARY_URL and AVIARY_TOKEN environment variables 26 | are required. 27 | 28 | Returns: 29 | backend: An instance of the Backend class.
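    Example (illustrative; assumes a backend server is running at the given URL):
        backend = get_llmadmin_backend("http://127.0.0.1:8000/cmp_models_default")
        print(backend.models())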
30 | """ 31 | mock_backend = os.getenv("AVIARY_MOCK", False) 32 | if mock_backend: 33 | backend = MockBackend() 34 | return backend 35 | print(os.getenv("AVIARY_URL")) 36 | llmadmin_url = url 37 | assert llmadmin_url is not None, "AVIARY_URL must be set" 38 | backend_token = os.getenv("AVIARY_TOKEN") 39 | bearer = f"Bearer {backend_token}" if backend_token is not None else "" 40 | if not llmadmin_url.endswith("/"): 41 | llmadmin_url += "/" 42 | print("Connecting to LLMAdmin backend at: ", llmadmin_url) 43 | backend = LLMAdminBackend(llmadmin_url, bearer) 44 | return backend 45 | 46 | 47 | class Backend(ABC): 48 | """Abstract interface for talking to LLMAdmin.""" 49 | 50 | @abstractmethod 51 | def models(self) -> List[str]: 52 | pass 53 | 54 | @abstractmethod 55 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 56 | pass 57 | 58 | @abstractmethod 59 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 60 | pass 61 | 62 | @abstractmethod 63 | def batch_completions( 64 | self, prompts: List[str], llm: str 65 | ) -> List[Dict[str, Union[str, float, int]]]: 66 | pass 67 | 68 | 69 | class LLMAdminBackend(Backend): 70 | """Interface for talking to LLMAdmin. 71 | Deliberately designed to be similar to OpenAI's 72 | Completions interface. 73 | 74 | https://platform.openai.com/docs/api-reference/completions?lang=python 75 | """ 76 | 77 | def __init__(self, backend_url: str, bearer: str): 78 | assert "::param" not in backend_url, "backend_url not set correctly" 79 | assert "::param" not in bearer, "bearer not set correctly" 80 | 81 | self.backend_url = backend_url 82 | self.bearer = bearer 83 | self.header = {"Authorization": self.bearer} 84 | 85 | def models(self) -> List[str]: 86 | url = self.backend_url + "models" 87 | print("Connecting backend to get models at: ", url) 88 | response = requests.get(url, headers=self.header, timeout=TIMEOUT) 89 | try: 90 | result = response.json() 91 | except requests.JSONDecodeError as e: 92 | raise BackendError( 93 | f"Error decoding JSON from {url}. Text response: {response.text}", 94 | response=response, 95 | ) from e 96 | return result 97 | 98 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 99 | url = self.backend_url + "metadata/" + llm.replace("/", "--") 100 | response = requests.get(url, headers=self.header, timeout=TIMEOUT) 101 | try: 102 | result = response.json() 103 | except requests.JSONDecodeError as e: 104 | raise BackendError( 105 | f"Error decoding JSON from {url}. Text response: {response.text}", 106 | response=response, 107 | ) from e 108 | return result 109 | 110 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 111 | url = self.backend_url + "query/" + llm.replace("/", "--") 112 | response = requests.post( 113 | url, 114 | headers=self.header, 115 | json={"prompt": prompt}, 116 | timeout=TIMEOUT, 117 | ) 118 | try: 119 | return response.json()[llm] 120 | except requests.JSONDecodeError as e: 121 | raise BackendError( 122 | f"Error decoding JSON from {url}. 
Text response: {response.text}", 123 | response=response, 124 | ) from e 125 | 126 | def batch_completions( 127 | self, prompts: List[str], llm: str 128 | ) -> List[Dict[str, Union[str, float, int]]]: 129 | url = self.backend_url + "query/batch/" + llm.replace("/", "--") 130 | response = requests.post( 131 | url, 132 | headers=self.header, 133 | json=[{"prompt": prompt} for prompt in prompts], 134 | timeout=TIMEOUT, 135 | ) 136 | try: 137 | return response.json()[llm] 138 | except requests.JSONDecodeError as e: 139 | raise BackendError( 140 | f"Error decoding JSON from {url}. Text response: {response.text}", 141 | response=response, 142 | ) from e 143 | 144 | 145 | class MockBackend(Backend): 146 | """Mock backend for testing""" 147 | 148 | def __init__(self): 149 | pass 150 | 151 | def models(self) -> List[str]: 152 | return ["A", "B", "C"] 153 | 154 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 155 | return { 156 | "metadata": { 157 | "model_config": { 158 | "model_id": llm, 159 | "model_url": f"https://huggingface.co/org/{llm}", 160 | "model_description": f"This is a model description for model {llm}", 161 | } 162 | } 163 | } 164 | 165 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 166 | return { 167 | "generated_text": prompt, 168 | "total_time": 99, 169 | "num_total_tokens": 42.3, 170 | } 171 | 172 | def batch_completions( 173 | self, prompts: List[str], llm: str 174 | ) -> List[Dict[str, Union[str, float, int]]]: 175 | return [ 176 | { 177 | "generated_text": prompt, 178 | "total_time": 99, 179 | "num_total_tokens": 42.3, 180 | } 181 | for prompt in prompts 182 | ] 183 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union 3 | 4 | import torch 5 | 6 | from llmadmin.backend.logger import get_logger 7 | from llmadmin.backend.server.models import Response 8 | 9 | from ...initializers.llamacpp import LlamaCppInitializer, LlamaCppTokenizer 10 | from .._base import StreamingPipeline 11 | from ..utils import decode_stopping_sequences_where_needed, construct_prompts 12 | 13 | if TYPE_CHECKING: 14 | from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | class LlamaCppPipeline(StreamingPipeline): 20 | """Text generation pipeline using llama.cpp. 
21 | 22 | May not support all features.""" 23 | 24 | def __init__( 25 | self, 26 | model: "Llama", 27 | tokenizer: LlamaCppTokenizer, 28 | prompt_format: Optional[str] = None, 29 | device: Optional[Union[str, int, torch.device]] = None, 30 | **kwargs, 31 | ) -> None: 32 | from llama_cpp import Llama 33 | 34 | if not isinstance(model, Llama): 35 | raise TypeError("Model must be an instance of llama_cpp.Llama.") 36 | self.model = model 37 | self.kwargs = kwargs 38 | self.tokenizer = tokenizer 39 | self.device = device 40 | self.prompt_format = prompt_format 41 | 42 | def _get_logits_processors( 43 | self, generate_kwargs: Dict[str, Any], model_inputs=None 44 | ) -> "LogitsProcessorList": 45 | from llama_cpp import LogitsProcessorList 46 | 47 | from llmadmin.backend.llm.pipelines.llamacpp.processors import ( 48 | LlamaCppMinNewTokensLengthLogitsProcessor, 49 | ) 50 | 51 | lst = [] 52 | 53 | if "min_new_tokens" in generate_kwargs: 54 | lst.append( 55 | LlamaCppMinNewTokensLengthLogitsProcessor( 56 | prompt_length_to_skip=len(model_inputs["tokenized_inputs"]), 57 | min_new_tokens=generate_kwargs.pop("min_new_tokens", 4), 58 | eos_token_id=self.model.token_eos(), 59 | ) 60 | ) 61 | 62 | return LogitsProcessorList(lst) 63 | 64 | def _get_stopping_criteria( 65 | self, generate_kwargs: Dict[str, Any], model_inputs=None 66 | ) -> "StoppingCriteriaList": 67 | from llama_cpp import StoppingCriteriaList 68 | 69 | from llmadmin.backend.llm.pipelines.llamacpp.processors import ( 70 | LlamaMaxTimeCriteria, 71 | ) 72 | 73 | lst = [] 74 | 75 | timeout_s = generate_kwargs.pop("timeout_s", None) 76 | start_timestamp = generate_kwargs.pop("start_timestamp", None) 77 | if timeout_s is not None and start_timestamp is not None: 78 | lst.append(LlamaMaxTimeCriteria(timeout_s, start_timestamp)) 79 | 80 | return StoppingCriteriaList(lst) 81 | 82 | def _add_default_generate_kwargs( 83 | self, generate_kwargs: Dict[str, Any], model_inputs=None 84 | ) -> Dict[str, Any]: 85 | generate_kwargs = generate_kwargs.copy() 86 | generate_kwargs.setdefault("echo", False) 87 | stopping_sequences = generate_kwargs.pop("stopping_sequences") 88 | stopping_sequences = decode_stopping_sequences_where_needed( 89 | self.tokenizer, stopping_sequences 90 | ) 91 | generate_kwargs.setdefault("stop", stopping_sequences) 92 | generate_kwargs["logits_processor"] = self._get_logits_processors( 93 | generate_kwargs, model_inputs=model_inputs 94 | ) 95 | generate_kwargs["stopping_criteria"] = self._get_stopping_criteria( 96 | generate_kwargs, model_inputs=model_inputs 97 | ) 98 | return generate_kwargs 99 | 100 | def __call__(self, inputs: List[str], **kwargs) -> List[Response]: 101 | logger.info(inputs) 102 | inputs = construct_prompts( 103 | inputs, prompt_format=self.prompt_format) 104 | 105 | logger.info(inputs) 106 | tokenized_inputs = self.tokenizer.encode(inputs[0]) 107 | kwargs = self._add_default_generate_kwargs( 108 | kwargs, 109 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs}, 110 | ) 111 | 112 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}") 113 | responses = [] 114 | for input in inputs: 115 | st = time.monotonic() 116 | output = self.model(input, **kwargs) 117 | gen_time = time.monotonic() - st 118 | text = output["choices"][0]["text"].replace("\u200b", "").strip() 119 | responses.append( 120 | Response( 121 | generated_text=text, 122 | num_generated_tokens=output["usage"]["completion_tokens"], 123 | num_input_tokens=output["usage"]["prompt_tokens"], 124 | 
num_generated_tokens_batch=output["usage"]["completion_tokens"], 125 | num_input_tokens_batch=output["usage"]["prompt_tokens"], 126 | preprocessing_time=None, 127 | postprocessing_time=None, 128 | generation_time=gen_time, 129 | ) 130 | ) 131 | return responses 132 | 133 | def stream( 134 | self, 135 | inputs: List[str], 136 | **kwargs, 137 | ) -> Iterator[torch.LongTensor]: 138 | tokenized_inputs = self.tokenizer.encode(inputs[0]) 139 | kwargs = self._add_default_generate_kwargs( 140 | kwargs, 141 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs}, 142 | ) 143 | 144 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}") 145 | first_token_done = False 146 | for input in inputs: 147 | for output in self.model(input, stream=True, **kwargs): 148 | st = time.monotonic() 149 | gen_time = time.monotonic() - st 150 | text = output["choices"][0]["text"].replace("\u200b", "") 151 | if not first_token_done: 152 | text = text.lstrip() 153 | first_token_done = True 154 | yield [ 155 | Response( 156 | generated_text=text, 157 | num_generated_tokens=1, 158 | num_input_tokens=len(tokenized_inputs), 159 | num_generated_tokens_batch=1, 160 | num_input_tokens_batch=len(tokenized_inputs), 161 | preprocessing_time=None, 162 | postprocessing_time=None, 163 | generation_time=gen_time, 164 | ) 165 | ] 166 | 167 | def preprocess(self, prompts: List[str], **generate_kwargs): 168 | pass 169 | 170 | def forward(self, model_inputs, **generate_kwargs): 171 | pass 172 | 173 | @classmethod 174 | def from_initializer( 175 | cls, 176 | initializer: "LlamaCppInitializer", 177 | model_id: str, 178 | device: Optional[Union[str, int, torch.device]] = None, 179 | **kwargs, 180 | ) -> "LlamaCppPipeline": 181 | assert isinstance(initializer, LlamaCppInitializer) 182 | logger.info(f"LlamaCppPipeline initializer loading model: {model_id}") 183 | model, tokenizer = initializer.load(model_id) 184 | logger.info(f"LlamaCppPipeline loaded model: {model}") 185 | return cls( 186 | model, 187 | tokenizer, 188 | device=device, 189 | **kwargs, 190 | ) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/text_generation_AdvertiseGen.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModel, DataCollatorForSeq2Seq, AutoModelForCausalLM 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | import jieba 7 | from rouge_chinese import Rouge 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | 10 | class NoheaderAdvertiseGen(Task): 11 | # AUTO_MODEL_CLASS = AutoModel 12 | AUTO_MODEL_CLASS = AutoModelForCausalLM 13 | 14 | DATASET_PATH = "AdvertiseGen" 15 | prompt_column = "content" 16 | response_column = "summary" 17 | # history_column = "history" 18 | 19 | def get_data_proprocess(self) -> Any: 20 | self.prompt_column = self.ft_config.data_config.input_columns[0] 21 | self.response_column = self.ft_config.data_config.validation_column 22 | self.DATASET_PATH = self.ft_config.data_config.data_path 23 | tokenizer = self.tokenizer 24 | max_length = self.ft_config.train_config.base_config.max_length 25 | # adopt python decorator TODO 26 | def preprocess_function(examples): 27 | # examples = examples.to_dict("list") 28 | #-- start 29 | max_source_length = int(max_length / 2) 30 | max_target_length = max_length - max_source_length 31 | max_source_length = 64 32 | max_target_length = 128 33 | max_seq_length = 
max_source_length + max_target_length + 1 34 | 35 | model_inputs = { 36 | "input_ids": [], 37 | "labels": [], 38 | } 39 | prefix = "" 40 | for i in range(len(examples[self.prompt_column])): 41 | if examples[self.prompt_column][i] and examples[self.response_column][i]: 42 | query, answer = examples[self.prompt_column][i], examples[self.response_column][i] 43 | 44 | # history = examples[history_column][i] if history_column is not None else None 45 | # history = None 46 | # prompt = tokenizer.build_prompt(query, history) 47 | 48 | prompt = prefix + query 49 | print(f"tokenizer is: {tokenizer}") 50 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True, padding=True, 51 | max_length=max_source_length) 52 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False, truncation=True, padding=True, 53 | max_length=max_target_length) 54 | 55 | context_length = len(a_ids) 56 | input_ids = a_ids + b_ids + [tokenizer.eos_token_id] 57 | labels = [tokenizer.pad_token_id] * context_length + b_ids + [tokenizer.eos_token_id] 58 | 59 | pad_len = max_seq_length - len(input_ids) 60 | input_ids = input_ids + [tokenizer.pad_token_id] * pad_len 61 | labels = labels + [tokenizer.pad_token_id] * pad_len 62 | 63 | # if data_args.ignore_pad_token_for_loss: 64 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 65 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 66 | 67 | model_inputs["input_ids"].append(input_ids) 68 | model_inputs["labels"].append(labels) 69 | 70 | return model_inputs 71 | 72 | return preprocess_function 73 | 74 | def get_eval_preprocess(self) -> Any: 75 | tokenizer = self.tokenizer 76 | def preprocess_function_eval(examples): 77 | max_source_length = 64 78 | max_target_length = 128 79 | inputs, targets = [], [] 80 | prefix = "" 81 | for i in range(len(examples[self.prompt_column])): 82 | if examples[self.prompt_column][i] and examples[self.response_column][i]: 83 | query = examples[self.prompt_column][i] 84 | # history = examples[history_column][i] if history_column is not None else None 85 | # history = None 86 | # prompt = tokenizer.build_prompt(query, history) 87 | inputs.append(query) 88 | targets.append(examples[self.response_column][i]) 89 | 90 | inputs = [prefix + inp for inp in inputs] 91 | model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding=True) 92 | labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True) 93 | 94 | # if data_args.ignore_pad_token_for_loss: 95 | # labels["input_ids"] = [ 96 | # [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 97 | # ] 98 | model_inputs["labels"] = labels["input_ids"] 99 | 100 | return model_inputs 101 | 102 | return preprocess_function_eval 103 | 104 | def get_compute_metrics(self) -> Any: 105 | tokenizer = self.tokenizer 106 | 107 | def compute_metrics(eval_preds): 108 | preds, labels = eval_preds 109 | if isinstance(preds, tuple): 110 | preds = preds[0] 111 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 112 | 113 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 114 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 115 | 116 | score_dict = { 117 | "rouge-1": [], 118 | "rouge-2": [], 119 | "rouge-l": [], 120 | "bleu-4": [] 121 | } 122 | for pred, label in zip(decoded_preds, decoded_labels): 123 | hypothesis = list(jieba.cut(pred)) 124 | reference = list(jieba.cut(label)) 125 | rouge = Rouge() 
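                # rouge_chinese expects whitespace-separated token strings, hence the ' '.join(...) on the jieba tokens below.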
126 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference)) 127 | result = scores[0] 128 | 129 | for k, v in result.items(): 130 | score_dict[k].append(round(v["f"] * 100, 4)) 131 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) 132 | score_dict["bleu-4"].append(round(bleu_score * 100, 4)) 133 | 134 | for k, v in score_dict.items(): 135 | score_dict[k] = float(np.mean(v)) 136 | return score_dict 137 | 138 | return compute_metrics 139 | 140 | def get_data_collator(self) -> Any: 141 | # label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id 142 | label_pad_token_id = self.tokenizer.pad_token_id 143 | data_collator = DataCollatorForSeq2Seq( 144 | tokenizer=self.tokenizer, 145 | model=self.model, 146 | label_pad_token_id=label_pad_token_id, 147 | pad_to_multiple_of=None, 148 | # padding=True 149 | padding=False 150 | ) 151 | return data_collator 152 | 153 | def training_key(self): 154 | """ 155 | :return: Iterable[obj] 156 | A iterable of any object, that doc_to_text can handle 157 | """ 158 | return "train" 159 | 160 | def validation_key(self): 161 | """ 162 | :return: Iterable[obj] 163 | A iterable of any object, that doc_to_text can handle 164 | """ 165 | return "validation" 166 | 167 | def getTrainDataSet(self): 168 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 169 | 170 | def getEvalDataSet(self): 171 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 172 | 173 | def getSmallTrainDataSet(self, len: int): 174 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 175 | 176 | def getSmallEvalDataSet(self, len: int): 177 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 178 | # return self.dataset[self.validation_key()].select(range(len)).map(self.get_eval_preprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 179 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/ray_train.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | import torch 5 | from ._base import BaseFT 6 | from llmadmin.backend.server.models import FTApp 7 | 8 | from datasets import load_dataset 9 | from transformers import AutoTokenizer 10 | import ray.data 11 | import torch 12 | import numpy as np 13 | 14 | from datasets import load_metric 15 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 16 | 17 | import ray.train 18 | from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback 19 | from ray.train.torch import TorchTrainer 20 | from ray.train import RunConfig, ScalingConfig, CheckpointConfig, FailureConfig 21 | from llmadmin.backend.logger import get_logger 22 | 23 | logger = get_logger(__name__) 24 | 25 | # GLUE_TASKS = [ 26 | # "cola", 27 | # "mnli", 28 | # "mnli-mm", 29 | # "mrpc", 30 | # "qnli", 31 | # "qqp", 32 | # "rte", 33 | # "sst2", 34 | # "stsb", 35 | # "wnli", 36 | # ] 37 | 38 | class RayTrain(BaseFT): 39 | 40 | def __init__(self, 
ftApp: FTApp): 41 | self.init_model_dataset() 42 | super().__init__(ftapp=ftApp) 43 | 44 | def init_model_dataset(self): 45 | self.use_gpu = False # set this to False to run on CPUs 46 | self.num_workers = 2 # set this to number of GPUs or CPUs you want to use 47 | logger.info(f"Is CUDA available: {torch.cuda.is_available()}") 48 | logger.info(f"init model and dataset with num_workers={self.num_workers}, use_gpu={self.use_gpu}") 49 | self.task_to_keys = { 50 | "cola": ("sentence", None), 51 | "mnli": ("premise", "hypothesis"), 52 | "mnli-mm": ("premise", "hypothesis"), 53 | "mrpc": ("sentence1", "sentence2"), 54 | "qnli": ("question", "sentence"), 55 | "qqp": ("question1", "question2"), 56 | "rte": ("sentence1", "sentence2"), 57 | "sst2": ("sentence", None), 58 | "stsb": ("sentence1", "sentence2"), 59 | "wnli": ("sentence1", "sentence2"), 60 | } 61 | self.task = "cola" 62 | self.actual_task = "mnli" if self.task == "mnli-mm" else self.task 63 | self.model_checkpoint = "/Users/hhwang/models/distilbert-base-uncased" 64 | 65 | logger.info(f"begin load model {self.model_checkpoint}") 66 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True) 67 | self.num_labels = 3 if self.task.startswith("mnli") else 1 if self.task == "stsb" else 2 68 | self.batch_size = 2 69 | 70 | dataset_path = "glue" 71 | logger.info(f"begin load dataset {dataset_path} -> {self.actual_task}") 72 | datasets = load_dataset(dataset_path, self.actual_task) 73 | logger.info(f"loaded datasets: {datasets}") 74 | item_count = 20 75 | logger.info(f"convert {item_count} records to ray dataset") 76 | self.ray_datasets = { 77 | "train": ray.data.from_huggingface(datasets["train"].select(range(item_count))), 78 | "validation": ray.data.from_huggingface(datasets["validation"].select(range(item_count))), 79 | "test": ray.data.from_huggingface(datasets["test"].select(range(item_count))), 80 | } 81 | self.train_count = self.ray_datasets["train"].count() 82 | self.validation_count = self.ray_datasets["validation"].count() 83 | self.test_count = self.ray_datasets["test"].count() 84 | logger.info(f"dataset train count: {self.train_count}") 85 | logger.info(f"dataset validation count: {self.validation_count}") 86 | logger.info(f"dataset test count: {self.test_count}") 87 | model_name = self.model_checkpoint.split("/")[-1] 88 | self.name = f"{model_name}-finetuned-{self.task}" 89 | logger.info(f"output model dir: {self.name}") 90 | 91 | # Tokenize input sentences 92 | def collate_fn(self, examples: Dict[str, np.array]): 93 | sentence1_key, sentence2_key = self.task_to_keys[self.task] 94 | if sentence2_key is None: 95 | outputs = self.tokenizer( 96 | list(examples[sentence1_key]), 97 | truncation=True, 98 | padding="longest", 99 | return_tensors="pt", 100 | ) 101 | else: 102 | outputs = self.tokenizer( 103 | list(examples[sentence1_key]), 104 | list(examples[sentence2_key]), 105 | truncation=True, 106 | padding="longest", 107 | return_tensors="pt", 108 | ) 109 | outputs["labels"] = torch.LongTensor(examples["label"]) 110 | 111 | if self.use_gpu: 112 | # Move all input tensors to GPU 113 | for key, value in outputs.items(): 114 | outputs[key] = value.cuda() 115 | 116 | return outputs 117 | 118 | def train_func(self, config): 119 | # Calculate the maximum steps per epoch based on the number of rows in the training dataset. 120 | # Make sure to scale by the total number of training workers and the per device batch size. 
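        # With the defaults above (20 training rows, batch_size=2, num_workers=2) this works out to 20 // (2 * 2) = 5 steps per epoch.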
121 | max_steps_per_epoch = self.ray_datasets["train"].count() // (self.batch_size * self.num_workers) 122 | logger.info(f"max_steps_per_epoch: {max_steps_per_epoch}, batch_size: {self.batch_size}, num_workers: {self.num_workers}") 123 | 124 | # metric = load_metric("glue", self.actual_task) 125 | tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True) 126 | model = AutoModelForSequenceClassification.from_pretrained( 127 | self.model_checkpoint, num_labels=self.num_labels 128 | ) 129 | 130 | train_ds = ray.train.get_dataset_shard("train") 131 | eval_ds = ray.train.get_dataset_shard("eval") 132 | 133 | train_ds_iterable = train_ds.iter_torch_batches( 134 | batch_size=self.batch_size, collate_fn=self.collate_fn 135 | ) 136 | eval_ds_iterable = eval_ds.iter_torch_batches( 137 | batch_size=self.batch_size, collate_fn=self.collate_fn 138 | ) 139 | 140 | args = TrainingArguments( 141 | self.name, 142 | evaluation_strategy="epoch", 143 | save_strategy="epoch", 144 | logging_strategy="epoch", 145 | per_device_train_batch_size=self.batch_size, 146 | per_device_eval_batch_size=self.batch_size, 147 | learning_rate=config.get("learning_rate", 2e-5), 148 | num_train_epochs=config.get("epochs", 2), 149 | weight_decay=config.get("weight_decay", 0.01), 150 | push_to_hub=False, 151 | max_steps=max_steps_per_epoch * config.get("epochs", 2), 152 | disable_tqdm=True, # declutter the output a little 153 | use_cpu=not self.use_gpu, # you need to explicitly set no_cuda if you want CPUs 154 | report_to="none", 155 | ) 156 | 157 | # def compute_metrics(eval_pred): 158 | # predictions, labels = eval_pred 159 | # if self.task != "stsb": 160 | # predictions = np.argmax(predictions, axis=1) 161 | # else: 162 | # predictions = predictions[:, 0] 163 | # return metric.compute(predictions=predictions, references=labels) 164 | 165 | trainer = Trainer( 166 | model, 167 | args, 168 | train_dataset=train_ds_iterable, 169 | eval_dataset=eval_ds_iterable, 170 | tokenizer=tokenizer, 171 | # compute_metrics=compute_metrics, 172 | ) 173 | 174 | trainer.add_callback(RayTrainReportCallback()) 175 | 176 | trainer = prepare_trainer(trainer) 177 | 178 | logger.info("Starting training") 179 | trainer.train() 180 | 181 | def train(self): 182 | # metric_name = ( 183 | # "pearson" 184 | # if self.task == "stsb" 185 | # else "matthews_correlation" 186 | # if self.task == "cola" 187 | # else "accuracy" 188 | # ) 189 | 190 | # validation_key = ( 191 | # "validation_mismatched" 192 | # if self.task == "mnli-mm" 193 | # else "validation_matched" 194 | # if self.task == "mnli" 195 | # else "validation" 196 | # ) 197 | logger.info(f"build ray TorchTrainer") 198 | 199 | trainer = TorchTrainer( 200 | self.train_func, 201 | scaling_config=ScalingConfig(num_workers=self.num_workers, use_gpu=self.use_gpu), 202 | datasets={ 203 | "train": self.ray_datasets["train"], 204 | "eval": self.ray_datasets["validation"], 205 | }, 206 | run_config=RunConfig( 207 | checkpoint_config=CheckpointConfig( 208 | num_to_keep=1, 209 | checkpoint_score_attribute="eval_loss", 210 | checkpoint_score_order="min", 211 | ), 212 | failure_config=FailureConfig( 213 | max_failures=5 214 | ) 215 | ), 216 | ) 217 | 218 | logger.info(f"begin ray train fit") 219 | result = trainer.fit() 220 | logger.info(f"end ray train fit") 221 | logger.info(f"result: {result}") 222 | 223 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/hf_transformers/deepspeed.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from typing import Any, Dict, Optional, Tuple 5 | 6 | import deepspeed 7 | import torch 8 | from huggingface_hub import snapshot_download 9 | from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel 10 | 11 | from llmadmin.backend.logger import get_logger 12 | 13 | from .base import TransformersInitializer 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | # TODO: Allow deepspeed kwargs 19 | class DeepSpeedInitializer(TransformersInitializer): 20 | """Initialize model (with DeepSpeed) and tokenizer and place them on the correct device. 21 | 22 | Args: 23 | device (torch.device): Device to place model and tokenizer on. 24 | world_size (int): Number of GPUs to use. 25 | dtype (torch.dtype, optional): Data type to use. Defaults to torch.float16. 26 | use_bettertransformer (bool, optional): Whether to use BetterTransformer. Defaults to False. 27 | torch_compile (Optional[Dict[str, Any]], optional): Parameters for ``torch.compile``. Defaults to None. 28 | max_tokens (int, optional): Maximum number of tokens to use. Defaults to 1024. 29 | use_kernel (bool, optional): Whether to use the DeepSpeed kernel injection. Defaults to False. 30 | use_meta_tensor (bool, optional): Whether to use meta tensor loading method. Defaults to False. 31 | injection_policy ([type], optional): Injection policy for DeepSpeed AutoTP. Cannot 32 | be set if use_kernel=True. Defaults to None. 33 | ds_inference_kwargs (Dict[str, Any], optional): Other keyword arguments for ``deepspeed.initialize``. 34 | Specific arguments in the signature of this function will override these values. 35 | **from_pretrained_kwargs: Keyword arguments for ``AutoModel.from_pretrained``. 
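    Example (illustrative only; the model id and settings are placeholders):
        initializer = DeepSpeedInitializer(
            device=torch.device("cuda"),
            world_size=1,
            dtype=torch.float16,
        )
        model = initializer.load_model("facebook/opt-1.3b")
        model = initializer.postprocess_model(model)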
36 | """ 37 | 38 | def __init__( 39 | self, 40 | device: torch.device, 41 | world_size: int, 42 | dtype: torch.dtype = torch.float16, 43 | use_bettertransformer: bool = False, 44 | torch_compile: Optional[Dict[str, Any]] = None, 45 | max_tokens: int = 1024, 46 | use_kernel: bool = False, 47 | use_meta_tensor: bool = False, 48 | injection_policy=None, 49 | ds_inference_kwargs: Optional[Dict[str, Any]] = None, 50 | **from_pretrained_kwargs, 51 | ): 52 | super().__init__( 53 | device=device, 54 | world_size=world_size, 55 | dtype=dtype, 56 | use_bettertransformer=use_bettertransformer, 57 | torch_compile=torch_compile, 58 | **from_pretrained_kwargs, 59 | ) 60 | self.max_tokens = max_tokens 61 | self.use_kernel = use_kernel 62 | self.use_meta_tensor = use_meta_tensor 63 | # TODO: Allow conversion from strings (need to do dynamic imports) 64 | self.injection_policy = injection_policy 65 | self.ds_inference_kwargs = ds_inference_kwargs 66 | 67 | if self.use_kernel: 68 | assert not (self.use_bettertransformer or self.torch_compile) 69 | 70 | if self.use_meta_tensor: 71 | assert self.use_kernel 72 | 73 | def _get_model_from_pretrained_kwargs(self): 74 | return dict( 75 | low_cpu_mem_usage=True, 76 | torch_dtype=self.dtype, 77 | **self.from_pretrained_kwargs, 78 | ) 79 | 80 | # From https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/utils.py 81 | def _generate_checkpoint_json( 82 | self, model_id: str, checkpoint_path: Optional[str] = None 83 | ) -> Tuple[str, str]: 84 | if checkpoint_path is None: 85 | repo_root = snapshot_download( 86 | model_id, 87 | allow_patterns=["*"], 88 | ignore_patterns=["*.safetensors", "*.h5", "*.msgpack"], 89 | local_files_only=False, 90 | revision=None, 91 | ) 92 | else: 93 | assert os.path.exists( 94 | checkpoint_path 95 | ), f"Checkpoint path {checkpoint_path} does not exist" 96 | repo_root = checkpoint_path 97 | 98 | if os.path.exists(os.path.join(repo_root, "ds_inference_config.json")): 99 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json") 100 | elif model_id in [ 101 | "microsoft/bloom-deepspeed-inference-int8", 102 | "microsoft/bloom-deepspeed-inference-fp16", 103 | ]: 104 | # tp presharded repos come with their own checkpoints config file 105 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json") 106 | else: 107 | checkpoints_json = os.path.join(repo_root, "checkpoints.json") 108 | 109 | with open(checkpoints_json, "w", encoding="utf-8") as f: 110 | file_list = [ 111 | str(entry).split("/")[-1] 112 | for entry in Path(repo_root).rglob("*.[bp][it][n]") 113 | if entry.is_file() 114 | ] 115 | data = {"type": "BLOOM", "checkpoints": file_list, "version": 1.0} 116 | json.dump(data, f) 117 | 118 | return os.path.abspath(repo_root), os.path.abspath(checkpoints_json) 119 | 120 | def load_model(self, model_id: str) -> "PreTrainedModel": 121 | model_id_or_path = self._get_model_location_on_disk(model_id) 122 | from_pretrained_kwargs = self._get_model_from_pretrained_kwargs() 123 | 124 | logger.info(f"Loading model {model_id_or_path}...") 125 | if self.use_meta_tensor: 126 | logger.info("Loading model using DeepSpeed meta tensor...") 127 | 128 | try: 129 | config = AutoConfig.from_pretrained( 130 | model_id_or_path, **from_pretrained_kwargs 131 | ) 132 | except OSError: 133 | if model_id_or_path != model_id: 134 | logger.warning( 135 | f"Couldn't load model from derived path {model_id_or_path}, " 136 | f"trying to load from model_id {model_id}" 137 | ) 138 | config = 
AutoConfig.from_pretrained( 139 | model_id, **from_pretrained_kwargs 140 | ) 141 | else: 142 | raise 143 | 144 | self._repo_root, self._checkpoints_json = self._generate_checkpoint_json( 145 | model_id 146 | ) 147 | 148 | with deepspeed.OnDevice(dtype=torch.float16, device="meta"): 149 | model = AutoModelForCausalLM.from_config(config) 150 | else: 151 | try: 152 | model = AutoModelForCausalLM.from_pretrained( 153 | model_id_or_path, **from_pretrained_kwargs 154 | ) 155 | except OSError: 156 | if model_id_or_path != model_id: 157 | logger.warning( 158 | f"Couldn't load model from derived path {model_id_or_path}, " 159 | f"trying to load from model_id {model_id}" 160 | ) 161 | model = AutoModelForCausalLM.from_pretrained( 162 | model_id, **from_pretrained_kwargs 163 | ) 164 | else: 165 | raise 166 | model.eval() 167 | return model 168 | 169 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel": 170 | from transformers import GPTNeoXForCausalLM, LlamaForCausalLM 171 | 172 | injection_policy = self.injection_policy 173 | # TODO: remove those later when deepspeed master is updated 174 | if injection_policy is None and not self.use_kernel: 175 | if isinstance(model, GPTNeoXForCausalLM): 176 | from transformers import GPTNeoXLayer 177 | 178 | injection_policy = { 179 | GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h") 180 | } 181 | elif isinstance(model, LlamaForCausalLM): 182 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 183 | 184 | injection_policy = { 185 | LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj") 186 | } 187 | 188 | if self.use_bettertransformer: 189 | from optimum.bettertransformer import BetterTransformer 190 | 191 | logger.info("Transforming the model with BetterTransformer...") 192 | model = BetterTransformer.transform(model) 193 | 194 | ds_kwargs = self.ds_inference_kwargs or {} 195 | ds_kwargs = ds_kwargs.copy() 196 | ds_kwargs.update( 197 | dict( 198 | dtype=self.dtype, 199 | mp_size=self.world_size, 200 | replace_with_kernel_inject=self.use_kernel, 201 | injection_policy=injection_policy, 202 | max_tokens=self.max_tokens, 203 | ) 204 | ) 205 | if self.use_meta_tensor: 206 | ds_kwargs.update( 207 | dict(base_dir=self._repo_root, checkpoint=self._checkpoints_json) 208 | ) 209 | 210 | logger.info(f"deepspeed.init_inference kwargs: {ds_kwargs}") 211 | model = deepspeed.init_inference( 212 | model, 213 | **ds_kwargs, 214 | ) 215 | 216 | if self.torch_compile and self.torch_compile["backend"]: 217 | logger.info("Compiling the model with torch.compile()...") 218 | model = torch.compile(model, **self.torch_compile) 219 | 220 | # Add attributes for compatibility with the pipeline 221 | model.use_kernel = self.use_kernel 222 | model.device = self.device 223 | model = model.to(self.device) 224 | return model 225 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import subprocess 4 | import time 5 | import traceback 6 | from collections import defaultdict 7 | from functools import wraps 8 | from typing import List, Optional 9 | 10 | from transformers import AutoConfig 11 | import torch.distributed as dist 12 | from filelock import FileLock 13 | from ray.air.util.torch_dist import ( 14 | ActorHandle, 15 | _get_node_and_gpu_ids, 16 | _init_torch_distributed, 17 | get_address_and_port, 18 | ) 19 | from torch.hub import _get_torch_home 20 | 21 | from 
llmadmin.backend.logger import get_logger 22 | from llmadmin.backend.server.models import S3MirrorConfig 23 | 24 | logger = get_logger(__name__) 25 | 26 | 27 | def download_model( 28 | model_id: str, 29 | endpoint_url: str, 30 | bucket_uri: str, 31 | s3_sync_args: Optional[List[str]] = None, 32 | ) -> None: 33 | """ 34 | Download a model from an S3 bucket and save it in TRANSFORMERS_CACHE for 35 | seamless interoperability with Hugging Face's Transformers library. 36 | 37 | The downloaded model must have a 'hash' file containing the commit hash corresponding 38 | to the commit on Hugging Face Hub. 39 | """ 40 | from transformers.utils.hub import TRANSFORMERS_CACHE 41 | 42 | isAutoLoadConfigSuccess = False 43 | modelConfig = None 44 | try: 45 | modelConfig = AutoConfig.from_pretrained( 46 | model_id, trust_remote_code=True) 47 | isAutoLoadConfigSuccess = True 48 | except Exception: 49 | isAutoLoadConfigSuccess = False 50 | 51 | if modelConfig and isAutoLoadConfigSuccess: 52 | logger.info(f"Model exist and success to load AutoConfig from_pretrained '{model_id}'") 53 | return 54 | else: 55 | logger.info(f"Fail to load AutoConfig from_pretrained '{model_id}'") 56 | 57 | s3_sync_args = s3_sync_args or [] 58 | logger.info(f"Downloading '{model_id}' from '{bucket_uri}' to '{TRANSFORMERS_CACHE}'") 59 | path = os.path.expanduser(os.path.join(TRANSFORMERS_CACHE, f"models--{model_id.replace('/', '--')}")) 60 | 61 | isS3 = bucket_uri.startswith('s3://') 62 | if isS3: 63 | model_hash_file = os.path.join(bucket_uri, "hash") 64 | if endpoint_url: 65 | logger.info(f"Downloading '{model_id}' hash from server '{endpoint_url}' '{model_hash_file}' ") 66 | subprocess.run(["aws", "--endpoint-url", endpoint_url, "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."]) 67 | else: 68 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ") 69 | subprocess.run(["aws", "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."]) 70 | else: 71 | model_hash_file = bucket_uri + "hash" 72 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ") 73 | subprocess.run(["cp -rf " + model_hash_file + " ."], shell=True) 74 | 75 | if not os.path.exists(os.path.join(".", "hash")): 76 | raise RuntimeError("Hash file not found in the bucket or bucket could not have been downloaded.") 77 | 78 | with open(os.path.join(".", "hash"), "r") as f: 79 | f_hash = f.read().strip() 80 | 81 | model_cache_path = os.path.join(path, "snapshots", f_hash) 82 | 83 | model_config_file = os.path.join(model_cache_path, "config.json") 84 | if os.path.exists(model_config_file): 85 | logger.info(f"Skip download model '{model_id}' due to config '{model_config_file}' exist") 86 | return 87 | 88 | subprocess.run(["mkdir", "-p", model_cache_path]) 89 | subprocess.run(["mkdir", "-p", os.path.join(path, "refs")]) 90 | 91 | logger.info(f"Downloading '{model_id}' files from '{bucket_uri}' to '{model_cache_path}'") 92 | if isS3: 93 | if endpoint_url: 94 | subprocess.run([ "aws", "--endpoint-url", endpoint_url, "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path]) 95 | else: 96 | subprocess.run([ "aws", "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path]) 97 | else: 98 | subprocess.run(["cp -rf " + bucket_uri + "*" + " " + model_cache_path], shell=True) 99 | 100 | with open(os.path.join(path, "refs", "main"), "w") as f: 101 | f.write(f_hash) 102 | 103 | def timeit(func): 104 | """ 105 | Decorator to time a function. 
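    Example (illustrative):
        @timeit
        def generate():
            ...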
106 | """ 107 | 108 | @wraps(func) 109 | def inner(*args, **kwargs): 110 | start_time = time.monotonic() 111 | ret = func(*args, **kwargs) 112 | time_taken = time.monotonic() - start_time 113 | logger.info(f"LLM timing: {func} took {time_taken} s to complete") 114 | return ret 115 | 116 | return inner 117 | 118 | 119 | def initialize_node( 120 | model_id: Optional[str] = None, 121 | s3_mirror_config: Optional[S3MirrorConfig] = None, 122 | ): 123 | """ 124 | Perform initialization for a node. 125 | 126 | Currently, that means downloading the model from the S3 bucket. 127 | """ 128 | # Create the torch cache kernels directory if it doesn't exist. 129 | # This is a workaround for a torch issue, where the kernels directory 130 | # cannot be created by torch if the parent directory doesn't exist. 131 | torch_cache_home = _get_torch_home() 132 | os.makedirs(os.path.join(torch_cache_home, "kernels"), exist_ok=True) 133 | 134 | if model_id and s3_mirror_config and s3_mirror_config.bucket_uri: 135 | lock_path = os.path.expanduser(f"~/{model_id.replace('/', '--')}.lock") 136 | try: 137 | # Timeout 0 means there will be only one attempt to acquire 138 | # the file lock. If it cannot be acquired, a TimeoutError 139 | # will be thrown. 140 | # This allows us to make sure that subsequent processes don't 141 | # duplicate work. 142 | with FileLock(lock_path, timeout=0): 143 | endpoint_url = s3_mirror_config.endpoint_url 144 | bucket_uri = s3_mirror_config.bucket_uri 145 | s3_sync_args = s3_mirror_config.s3_sync_args 146 | try: 147 | download_model(model_id, endpoint_url, bucket_uri, s3_sync_args=s3_sync_args) 148 | logger.info("Done downloading the model from bucket!") 149 | except RuntimeError: 150 | logger.warning( 151 | f"Unable to download the model from bucket. Traceback:\n {traceback.format_exc()}" 152 | ) 153 | except TimeoutError: 154 | # If the lock is already held, wait for the holder to finish, but do nothing. 155 | with FileLock(lock_path, timeout=-1): 156 | pass 157 | 158 | 159 | def merge_dicts(overwrite: dict, base: dict) -> dict: 160 | """ 161 | Merge two dictionaries recursively, with keys from overwrite taking precedence. 162 | """ 163 | base = base.copy() 164 | for key, value in overwrite.items(): 165 | if isinstance(value, dict): 166 | # Merge into the existing node (or a new one) and keep the returned copy. 167 | node = base.setdefault(key, {}) 168 | base[key] = merge_dicts(value, node) 169 | else: 170 | base[key] = value 171 | 172 | return base 173 | 174 | 175 | async def init_torch_dist_process_group_async( 176 | workers: List[ActorHandle], 177 | backend: str = "gloo", 178 | init_method: str = "env", 179 | ) -> List[int]: 180 | """Initialize a torch distributed process group asynchronously. 181 | 182 | This is identical to 183 | ``ray.air.util.torch_dist.init_torch_dist_process_group`` 184 | but uses asyncio to avoid blocking the event loop. 185 | 186 | Note: this util assumes that the workers are passed in 187 | the order of their global ranks. 188 | 189 | Args: 190 | workers: A list of TorchDistributedWorker actors. 191 | backend: The torch distributed backend to use, 192 | possible choices are "gloo" or "nccl". 193 | init_method: The initialization method to use, 194 | possible choices are "env" or "tcp". 195 | 196 | Returns: 197 | Local ranks on their respective nodes for the list of workers. 198 | """ 199 | if not dist.is_available(): 200 | raise RuntimeError("Distributed torch is not available.") 201 | 202 | # Build a map from node_id to workers on that node.
203 | node_and_gpu_ids = await asyncio.gather( 204 | *[w.execute.remote(_get_node_and_gpu_ids) for w in workers] 205 | ) 206 | # All the workers on a specific node. 207 | node_to_workers = defaultdict(list) 208 | # All the gpu ids visible to all the workers on a specific node. 209 | node_to_gpu_ids = defaultdict(set) 210 | for i, (node_id, gpu_ids) in enumerate(node_and_gpu_ids): 211 | node_to_workers[node_id].append(i) 212 | # Force list. 213 | if not isinstance(gpu_ids, list): 214 | gpu_ids = [gpu_ids] 215 | # It is possible for a worker to have access to multiple GPUs. 216 | for gpu_id in gpu_ids: 217 | node_to_gpu_ids[node_id].add(gpu_id) 218 | 219 | # Assume the first worker is the master. 220 | master_addr, master_port = ( 221 | await asyncio.gather(workers[0].execute.remote(get_address_and_port)) 222 | )[0] 223 | 224 | setup_futures = [] 225 | world_size = len(workers) 226 | local_ranks = [] 227 | for rank, worker in enumerate(workers): 228 | node_id = node_and_gpu_ids[rank][0] 229 | local_rank = node_to_workers[node_id].index(rank) 230 | local_world_size = len(node_to_workers[node_id]) 231 | setup_futures.append( 232 | worker.execute.remote( 233 | _init_torch_distributed, 234 | init_method=init_method, 235 | backend=backend, 236 | rank=rank, 237 | world_size=world_size, 238 | local_rank=local_rank, 239 | local_world_size=local_world_size, 240 | master_addr=master_addr, 241 | master_port=master_port, 242 | # sorted() gives a deterministic ordering, so 243 | # CUDA_VISIBLE_DEVICES is always sorted. 244 | gpu_ids=sorted(node_to_gpu_ids[node_id]), 245 | ) 246 | ) 247 | local_ranks.append(local_rank) 248 | 249 | # Wait for all workers to join the process group. 250 | await asyncio.gather(*setup_futures) 251 | 252 | return local_ranks 253 | -------------------------------------------------------------------------------- /llmadmin/api/cli.py: -------------------------------------------------------------------------------- 1 | # import ast 2 | # import json 3 | from typing import Annotated, Optional 4 | 5 | import typer 6 | # from rich import print as rp 7 | # from rich.console import Console 8 | # from rich.progress import Progress, SpinnerColumn, TextColumn 9 | # from rich.table import Table 10 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 11 | from llmadmin.api import sdk 12 | 13 | app = typer.Typer() 14 | 15 | model_type = typer.Option( 16 | default=..., help="The model to use. You can specify multiple models." 17 | ) 18 | 19 | ft_define = typer.Option( 20 | default=..., help="The fine-tune YAML definition file." 21 | ) 22 | 23 | app_name = typer.Option( 24 | default=..., help="The name of the Ray Serve application." 25 | ) 26 | host = typer.Option( 27 | default=..., help="The host IP address of the Ray API server." 28 | ) 29 | port = typer.Option( 30 | default=..., help="The port of the API server." 31 | ) 32 | prompt_type = typer.Option(help="Prompt to query") 33 | stats_type = typer.Option(help="Whether to print generated statistics") 34 | prompt_file_type = typer.Option( 35 | default=..., help="File containing prompts (a simple text file)." 36 | ) 37 | separator_type = typer.Option(help="Separator used in prompt files") 38 | results_type = typer.Option(help="Where to save the results") 39 | file_type = typer.Option(default=..., help="The flow graph JSON file.") 40 | port_type = typer.Option(default=..., help="The port of the service.") 41 | apiserver_scale_type = typer.Option(default=..., help="A dict-style string for scaling the service,
for example: --scale-config=min_replicas=1,max_replicas=5") 42 | apiserver_resource_type = typer.Option(default=..., help="A string of dict for resource requirement. for example: --resource-config=num_cpus=1") 43 | 44 | @app.command() 45 | def start_apiserver( 46 | port: Annotated[Optional[int], port_type] = DEFAULT_HTTP_PORT, 47 | resource_config: Annotated[str, apiserver_resource_type] = None, 48 | scale_config: Annotated[str, apiserver_scale_type] = None 49 | ): 50 | """Start a api server, it will provide apis. 51 | Args: 52 | *host: The host ip to run. 53 | *port: The port to run. 54 | """ 55 | sdk.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config) 56 | 57 | @app.command() 58 | def run_ft(ft: Annotated[str, ft_define]): 59 | """Start a fine tune process. 60 | 61 | Args: 62 | *model: The model to run. 63 | """ 64 | sdk.run_ft(ft) 65 | 66 | @app.command() 67 | def ray_ft(model: Annotated[str, ft_define]): 68 | """Start a fine tune ray process. 69 | 70 | Args: 71 | *model: The model to run. 72 | """ 73 | sdk.run_ray_ft(model) 74 | 75 | # @app.command() 76 | # def list_models(metadata: Annotated[bool, "Whether to print metadata"] = False): 77 | # """Get a list of the available models""" 78 | # result = sdk.models() 79 | # if metadata: 80 | # for model in result: 81 | # rp(f"[bold]{model}:[/]") 82 | # rp(sdk.metadata(model)) 83 | # else: 84 | # print("\n".join(result)) 85 | 86 | 87 | # def _print_result(result, model, print_stats): 88 | # rp(f"[bold]{model}:[/]") 89 | # if print_stats: 90 | # rp("[bold]Stats:[/]") 91 | # rp(result) 92 | # else: 93 | # rp(result) 94 | 95 | 96 | # def progress_spinner(): 97 | # return Progress( 98 | # SpinnerColumn(), 99 | # TextColumn("[progress.description]{task.description}"), 100 | # transient=True, 101 | # ) 102 | 103 | 104 | # @app.command() 105 | # def query( 106 | # model: Annotated[List[str], model_type], 107 | # prompt: Annotated[Optional[List[str]], prompt_type] = None, 108 | # prompt_file: Annotated[Optional[str], prompt_file_type] = None, 109 | # separator: Annotated[str, separator_type] = "----", 110 | # output_file: Annotated[str, results_type] = "llmadmin-output.json", 111 | # print_stats: Annotated[bool, stats_type] = False, 112 | # ): 113 | # """Query one or several models with one or multiple prompts, 114 | # optionally read from file, and save the results to a file.""" 115 | # with progress_spinner() as progress: 116 | # if prompt_file: 117 | # with open(prompt_file, "r") as f: 118 | # prompt = f.read().split(separator) 119 | 120 | # results = {p: [] for p in prompt} 121 | 122 | # for m in model: 123 | # progress.add_task( 124 | # description=f"Processing all prompts against model: {m}.", 125 | # total=None, 126 | # ) 127 | # query_results = sdk.batch_query(m, prompt) 128 | # for result in query_results: 129 | # _print_result(result, m, print_stats) 130 | 131 | # for i, p in enumerate(prompt): 132 | # result = query_results[i] 133 | # text = result 134 | # # del result["generated_text"] 135 | # results[p].append({"model": m, "result": text, "stats": result}) 136 | 137 | # progress.add_task(description="Writing output file.", total=None) 138 | # with open(output_file, "w") as f: 139 | # f.write(json.dumps(results, indent=2)) 140 | 141 | 142 | # @app.command(deprecated=True, name="batch_query") 143 | # def batch_query( 144 | # model: Annotated[List[str], model_type], 145 | # prompt: Annotated[List[str], prompt_type], 146 | # print_stats: Annotated[bool, stats_type] = False, 147 | # ): 148 | # """Query a 
model with a batch of prompts.""" 149 | # with progress_spinner() as progress: 150 | # for m in model: 151 | # progress.add_task( 152 | # description=f"Processing prompt against {m}...", total=None 153 | # ) 154 | # results = sdk.batch_query(m, prompt) 155 | # for result in results: 156 | # _print_result(result, m, print_stats) 157 | 158 | 159 | # @app.command(deprecated=True, name="multi_query") 160 | # def multi_query( 161 | # model: Annotated[List[str], model_type], 162 | # prompt_file: Annotated[str, prompt_file_type], 163 | # separator: Annotated[str, separator_type] = "----", 164 | # output_file: Annotated[str, results_type] = "llmadmin-output.json", 165 | # ): 166 | # """Query one or multiple models with a batch of prompts taken from a file.""" 167 | 168 | # with progress_spinner() as progress: 169 | # progress.add_task( 170 | # description=f"Loading your prompts from {prompt_file}.", total=None 171 | # ) 172 | # with open(prompt_file, "r") as f: 173 | # prompts = f.read().split(separator) 174 | # results = {prompt: [] for prompt in prompts} 175 | 176 | # for m in model: 177 | # progress.add_task( 178 | # description=f"Processing all prompts against model: {model}.", 179 | # total=None, 180 | # ) 181 | # query_results = sdk.batch_query(m, prompts) 182 | # for i, prompt in enumerate(prompts): 183 | # result = query_results[i] 184 | # text = result["generated_text"] 185 | # del result["generated_text"] 186 | # results[prompt].append({"model": m, "result": text, "stats": result}) 187 | 188 | # progress.add_task(description="Writing output file.", total=None) 189 | # with open(output_file, "w") as f: 190 | # f.write(json.dumps(results, indent=2)) 191 | 192 | 193 | # evaluator_type = typer.Option(help="Which LLM to use for evaluation") 194 | 195 | 196 | # @app.command() 197 | # def run(model: Annotated[List[str], model_type]): 198 | # """Start a model. 199 | 200 | # Args: 201 | # *model: The model to run. 202 | # """ 203 | # sdk.run(*model) 204 | 205 | # @app.command() 206 | # def run_experimental(model: Annotated[List[str], model_type]): 207 | # """Start a model for experimental, it will do inference by transformer pipeline. 208 | 209 | # Args: 210 | # *model: The model to run. 211 | # """ 212 | # sdk.run_experimental(*model) 213 | 214 | # @app.command() 215 | # def del_serve(appname: Annotated[str, app_name]): 216 | # """Remove a ray serve. 217 | 218 | # Args: 219 | # *model: The model to run. 220 | # """ 221 | # sdk.del_experimental(appname) 222 | 223 | # @app.command() 224 | # def run_application(file: Annotated[str, file_type]): 225 | # """Start a model in LLMAdmin for experimental. 226 | 227 | # Args: 228 | # *model: The model to run. 229 | # """ 230 | # from pathlib import Path 231 | # # If input is a file path, load JSON from the file 232 | # if isinstance(file, (str, Path)): 233 | # with open(file, "r", encoding="utf-8") as f: 234 | # flow_graph = json.load(f) 235 | # else: 236 | # raise TypeError( 237 | # "Input must be a file path (str)" 238 | # ) 239 | # sdk.run_application(flow_graph) 240 | 241 | 242 | 243 | # @app.command() 244 | # def run_comparation(): 245 | # """Start frontend for model comparation. 246 | 247 | # Args: 248 | # *model: The model to run. 
249 | # """ 250 | # sdk.run_comparation() 251 | 252 | # @app.command() 253 | # def evaluate( 254 | # input_file: Annotated[str, results_type] = "llmadmin-output.json", 255 | # evaluation_file: Annotated[str, results_type] = "evaluation-output.json", 256 | # evaluator: Annotated[str, evaluator_type] = "gpt-4", 257 | # ): 258 | # """Evaluate and summarize the results of a multi_query run with a strong 259 | # 'evaluator' LLM like GPT-4. 260 | # The results of the ranking are stored to file and displayed in a table. 261 | # """ 262 | # with progress_spinner() as progress: 263 | # progress.add_task(description="Loading the evaluator LLM.", total=None) 264 | # if evaluator == "gpt-4": 265 | # from llmadmin.common.evaluation import GPT 266 | 267 | # eval_model = GPT() 268 | # else: 269 | # raise NotImplementedError(f"No evaluator for {evaluator}") 270 | 271 | # with open(input_file, "r") as f: 272 | # results = json.load(f) 273 | 274 | # for prompt, result_list in results.items(): 275 | # progress.add_task( 276 | # description=f"Evaluating results for prompt: {prompt}.", total=None 277 | # ) 278 | # evaluation = eval_model.evaluate_results(prompt, result_list) 279 | # try: 280 | # # GPT-4 returns a string with a Python dictionary, hopefully! 281 | # evaluation = ast.literal_eval(evaluation) 282 | # except Exception: 283 | # print(f"Could not parse evaluation: {evaluation}") 284 | 285 | # for i, _res in enumerate(results[prompt]): 286 | # results[prompt][i]["rank"] = evaluation[i]["rank"] 287 | 288 | # progress.add_task(description="Storing evaluations.", total=None) 289 | # with open(evaluation_file, "w") as f: 290 | # f.write(json.dumps(results, indent=2)) 291 | 292 | # for prompt in results.keys(): 293 | # table = Table(title="Evaluation results (higher ranks are better)") 294 | 295 | # table.add_column("Model", justify="left", style="cyan", no_wrap=True) 296 | # table.add_column("Rank", style="magenta") 297 | # table.add_column("Response", justify="right", style="green") 298 | 299 | # for i, _res in enumerate(results[prompt]): 300 | # model = results[prompt][i]["model"] 301 | # response = results[prompt][i]["result"] 302 | # rank = results[prompt][i]["rank"] 303 | # table.add_row(model, str(rank), response) 304 | 305 | # console = Console() 306 | # console.print(table) 307 | 308 | 309 | if __name__ == "__main__": 310 | app() 311 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Anyscale 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/transformer.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseFT 2 | from abc import ABC, abstractmethod 3 | from llmadmin.backend.logger import get_logger 4 | # from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset 5 | # from typing import Union 6 | from llmadmin.backend.server.models import FTApp 7 | from datasets import load_dataset 8 | from datasets import load_metric 9 | import pandas as pd 10 | # from ray.data.preprocessors import BatchMapper 11 | # import ray 12 | import torch 13 | from transformers import TrainingArguments, Trainer 14 | # import numpy as np 15 | # from ray.train.huggingface import TransformersTrainer 16 | # from ray.air.config import RunConfig, CheckpointConfig 17 | from .utils import parse_task_name 18 | from .tasks import TASK_REGISTRY 19 | from .tasks._base import Task 20 | from .methods.base import get_train_model 21 | # from ray.train.huggingface import TransformersCheckpoint 22 | from .const import CHECKPOINT_PATH 23 | from .callback import CustomCallback 24 | 25 | from llmadmin.backend.llm.utils import initialize_node 26 | 27 | logger = get_logger(__name__) 28 | 29 | class TransformersFT(BaseFT): 30 | def __init__(self, ftApp: FTApp): 31 | super().__init__(ftapp=ftApp) 32 | 33 | def train(self): 34 | self.trainV2() 35 | 36 | # Transformer train only 37 | def trainV2(self): 38 | taskobj: Task = None 39 | task = parse_task_name(self.ftapp) 40 | logger.info(f"TransformersFT.trainV2 finetune task name: '{task}'") 41 | taskcls = TASK_REGISTRY[task] 42 | 43 | if not taskcls: 44 | logger.error(f"Couldn't load defined task from the task registry: '{task}'") 45 | raise ValueError(f"Unsupported finetune task: '{task}'") 46 | 47 | logger.info("Initializing the node for finetuning") 48 | initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config) 49 | logger.info(f"Loading tokenizer for finetuning '{self.model_config.model_id}'") 50 | # self.model_config.model_id = '/root/.cache/huggingface/hub/ZhipuAI/chatglm3-6b/' 51 | # self.model_config.model_id = '/data/hhwang/models/chatglm2-6b/' 52 | tokenizer = self.initializer.load_tokenizer(self.model_config.model_id) 53 | if self.model_config.add_special_tokens: 54 | add_special_tokens = self.model_config.add_special_tokens 55 | if add_special_tokens.get("pad_token"): 56 | tokenizer.pad_token = add_special_tokens.get("pad_token") 57 | if add_special_tokens.get("eos_token"): 58 | tokenizer.eos_token = add_special_tokens.get("eos_token") 59 | logger.info(f"Initializing {taskcls} and loading dataset") 60 | # logger.info(f"Initialize {taskcls} and load dataset for model {self.model_config.model_id}") 61 | taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config) 62 | logger.info(f"Loading model {self.model_config.model_id} with {taskobj.AUTO_MODEL_CLASS}") 63 | from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {} 64 | model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs) 65 | if self.model_config.quantization_bit is not None: 66 | logger.info(f"Quantizing model to {self.model_config.quantization_bit} bit") 67 | model = model.quantize(self.model_config.quantization_bit) 68 | 69 | taskobj.set_model(model) 70 | 71 | # preprocess_function = taskobj.get_data_proprocess() 72 | # compute_metrics_function = taskobj.get_compute_metrics() 73 | data_collator =
taskobj.get_data_collator() 74 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas") 75 | 76 | data_config = self.ftapp.ft_config.data_config 77 | use_gpu = True if torch.cuda.is_available() else False 78 | use_mps = True if torch.backends.mps.is_available() else False 79 | logger.info(f"use_gpu: {use_gpu}, use_cpu: {not use_gpu}, use_mps: {use_mps}") 80 | 81 | logger.info(f"Finetune get train and validation dataset") 82 | if data_config.num_row > 0: 83 | # only for test purpose 84 | train_dataset = taskobj.getSmallTrainDataSet(data_config.num_row) 85 | eval_dataset = taskobj.getSmallEvalDataSet(data_config.num_row) 86 | else: 87 | # For train 88 | train_dataset = taskobj.getTrainDataSet() 89 | eval_dataset = taskobj.getEvalDataSet() 90 | 91 | logger.info(f"Finetune train dataset {train_dataset}") 92 | logger.info(f"Finetune eval dataset {eval_dataset}") 93 | 94 | if hasattr(model, "is_parallelizable"): 95 | logger.info(f"model.is_parallelizable = {model.is_parallelizable}") 96 | 97 | if hasattr(model, "model_parallel"): 98 | logger.info(f"model.model_parallel = {model.model_parallel}") 99 | 100 | if getattr(model, "hf_device_map", None) is not None: 101 | logger.info(f"model.hf_device_map is {model.hf_device_map}") 102 | 103 | ftConfig = self.ftapp.ft_config.train_config.base_config 104 | model_name = self.model_config.model_id.split("/")[-1] 105 | task_name = self.ft_task 106 | outputDir = f"{ftConfig.checkpoints_output_dir}/{model_name}-finetuned-{task_name}-{data_config.data_path}-{data_config.subset}" 107 | logger.info(f"Finetune checkpoints output dir: {outputDir}") 108 | args = TrainingArguments( 109 | outputDir, 110 | evaluation_strategy=ftConfig.evaluation_strategy, 111 | save_strategy=ftConfig.save_strategy, 112 | logging_strategy=ftConfig.logging_strategy, 113 | logging_steps = 2, 114 | save_steps = ftConfig.save_steps, 115 | eval_steps = 2, 116 | learning_rate=ftConfig.learning_rate, 117 | per_device_train_batch_size=ftConfig.per_device_train_batch_size, 118 | per_device_eval_batch_size=ftConfig.per_device_eval_batch_size, 119 | num_train_epochs=ftConfig.num_train_epochs, 120 | weight_decay=ftConfig.weight_decay, 121 | push_to_hub=False, 122 | disable_tqdm=False, # declutter the output a little 123 | use_cpu=not use_gpu, # you need to explicitly set no_cuda if you want CPUs 124 | remove_unused_columns=ftConfig.remove_unused_columns, 125 | ) 126 | trainConfig = self.ftapp.ft_config.train_config 127 | ftMethod = self.ftapp.ft_config.ft_method 128 | model = get_train_model(model, ftMethod, trainConfig) 129 | trainer = Trainer( 130 | # trainer = Seq2SeqTrainer( 131 | model, 132 | args, 133 | train_dataset=train_dataset, 134 | eval_dataset=eval_dataset, 135 | tokenizer=tokenizer, 136 | # compute_metrics=compute_metrics_function, 137 | data_collator=data_collator, 138 | ) 139 | trainer.add_callback(CustomCallback(trainer)) 140 | logger.info("Starting training") 141 | trainResult = trainer.train() 142 | logger.info(f"Train result {trainResult}") 143 | trainer.save_model() 144 | logger.info(f"Save model to {trainer.args.output_dir}") 145 | logger.info("Done training") 146 | 147 | # depend on ray for distribution 148 | # def trainV1(self): 149 | # taskobj: Task = None 150 | # task = parse_task_name(self.ftapp) 151 | # logger.info(f"TransformersFT.trainV1 finetune task name {task}") 152 | # taskcls = TASK_REGISTRY[task] 153 | 154 | # if not taskcls: 155 | # logger.error(f"Couldn't load defined task from register: {task}") 156 | # raise 157 | 158 | # 
logger.info("Starting initialize Finetune node tasks") 159 | # initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config) 160 | 161 | # tokenizer = self.initializer.load_tokenizer(self.model_config.model_id) 162 | # logger.info("Done load tokenizer for finetune") 163 | 164 | # taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config) 165 | 166 | # from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {} 167 | # model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs) 168 | # taskobj.set_model(model) 169 | 170 | # preprocess_function = taskobj.get_data_proprocess() 171 | # compute_metrics_function = taskobj.get_compute_metrics() 172 | # data_collator = taskobj.get_data_collator() 173 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas") 174 | 175 | # ray_datasets = ray.data.from_huggingface(taskobj.get_dataset()) 176 | # model_name = self.model_config.model_id.split("/")[-1] 177 | # task = self.ft_task 178 | # name = f"{model_name}-finetuned-{task}" 179 | # use_gpu = True if torch.cuda.is_available() else False 180 | 181 | # def trainer_init_per_worker(train_dataset, eval_dataset = None, **config): 182 | # print(f"Is CUDA available: {torch.cuda.is_available()}") 183 | 184 | # args = TrainingArguments( 185 | # name, 186 | # evaluation_strategy=config.get("evaluation_strategy", "epoch"), 187 | # save_strategy=config.get("save_strategy", "epoch"), 188 | # logging_strategy=config.get("logging_strategy", "epoch"), 189 | # logging_steps = 2, 190 | # save_steps = 500, 191 | # eval_steps = 2, 192 | # learning_rate=config.get("learning_rate", 2e-5), 193 | # per_device_train_batch_size=config.get("per_device_train_batch_size", 16), 194 | # per_device_eval_batch_size=config.get("per_device_train_batch_size", 16), 195 | # num_train_epochs=config.get("epochs", 2), 196 | # weight_decay=config.get("weight_decay", 0.01), 197 | # push_to_hub=False, 198 | # disable_tqdm=False, # declutter the output a little 199 | # no_cuda=not use_gpu, # you need to explicitly set no_cuda if you want CPUs 200 | # remove_unused_columns=config.get("remove_unused_columns", True), 201 | # fp16=True, 202 | # ) 203 | 204 | # trainer = Trainer( 205 | # model, 206 | # args, 207 | # train_dataset=train_dataset, 208 | # eval_dataset=eval_dataset, 209 | # tokenizer=tokenizer, 210 | # compute_metrics=compute_metrics_function, 211 | # data_collator=data_collator, 212 | # ) 213 | # trainer.add_callback(CustomCallback(trainer)) 214 | # print("Starting training") 215 | 216 | # return trainer 217 | 218 | # trainer = TransformersTrainer( 219 | # trainer_init_per_worker=trainer_init_per_worker, 220 | # trainer_init_config = self.train_conf.get_train_kwargs(), 221 | # scaling_config=self.scale_config.as_air_scaling_config(), 222 | # datasets={ 223 | # "train": ray_datasets[taskobj.training_key()], 224 | # "evaluation": ray_datasets[taskobj.validation_key()], 225 | # }, 226 | # run_config=RunConfig( 227 | # # callbacks=[MLflowLoggerCallback(experiment_name=name)], 228 | # checkpoint_config=CheckpointConfig( 229 | # num_to_keep=1, 230 | # checkpoint_score_attribute="eval_loss", 231 | # checkpoint_score_order="min", 232 | # ), 233 | # ), 234 | # preprocessor=batch_encoder, 235 | # ) 236 | 237 | # result = trainer.fit() 238 | # print(result) 239 | # checkpoint = TransformersCheckpoint.from_checkpoint(result.checkpoint) 240 | # hf_trainer = 
checkpoint.get_model(model=taskobj.AUTO_MODEL_CLASS) 241 | # hf_trainer.save_pretrained(CHECKPOINT_PATH) 242 | # tokenizer.save_pretrained(CHECKPOINT_PATH) 243 | 244 | # print("Done") 245 | 246 | 247 | 248 | 249 | --------------------------------------------------------------------------------
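
A minimal usage sketch for two of the helpers defined in llmadmin/backend/llm/utils.py above: merge_dicts, which combines a base configuration with a set of overrides (keys from the overrides take precedence, including nested ones), and timeit, which logs the runtime of the wrapped call. The dictionaries and the tokenize_batch function below are made-up illustrations, not part of the repository.

from llmadmin.backend.llm.utils import merge_dicts, timeit

# merge_dicts(overwrite, base): recursive merge in which keys from `overwrite` win.
base = {"train": {"learning_rate": 2e-5, "num_train_epochs": 2}, "push_to_hub": False}
overrides = {"train": {"num_train_epochs": 5}}
merged = merge_dicts(overrides, base)
# Nested keys from `overrides` win: num_train_epochs becomes 5, learning_rate is kept,
# and neither input dictionary is modified.
print(merged)

# timeit: decorator that logs how long the wrapped call took via the backend logger.
@timeit
def tokenize_batch(texts):
    return [t.split() for t in texts]

tokenize_batch(["hello world"] * 1000)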
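
initialize_node() in the same module coordinates multiple workers on one machine through a file lock: the first process to acquire the lock performs the download, and every later process blocks until the lock is released, then returns without repeating the work. The standalone sketch below illustrates only that locking pattern; fetch_weights is a hypothetical stand-in for download_model(), and the /tmp lock path is chosen purely for illustration.

from filelock import FileLock, Timeout

def fetch_weights(model_id: str) -> None:
    # Hypothetical placeholder for the real download_model() call.
    print(f"downloading {model_id} ...")

def ensure_downloaded_once(model_id: str) -> None:
    lock_path = f"/tmp/{model_id.replace('/', '--')}.lock"
    try:
        # timeout=0 means a single acquisition attempt; filelock raises
        # Timeout if another process already holds the lock.
        with FileLock(lock_path, timeout=0):
            fetch_weights(model_id)
    except Timeout:
        # Another process is downloading; block until it releases the lock,
        # then return without duplicating the work.
        with FileLock(lock_path, timeout=-1):
            pass

ensure_downloaded_once("bert-base-uncased")

Catching filelock.Timeout directly is interchangeable with the TimeoutError handling in initialize_node(), since recent filelock releases derive Timeout from TimeoutError.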