├── llmadmin
│   ├── api
│   │   ├── __init__.py
│   │   ├── env.py
│   │   ├── sdk.py
│   │   └── cli.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── llm_event.py
│   │   ├── evaluation.py
│   │   └── backend.py
│   ├── backend
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── ft
│   │   │   │   ├── const.py
│   │   │   │   ├── methods
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── lora.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── tasks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── sequenceclassification_glue_cola.py
│   │   │   │   │   ├── sequenceclassification_glue_mrpc.py
│   │   │   │   │   ├── sequenceclassification_yelp_review_full.py
│   │   │   │   │   ├── _base.py
│   │   │   │   │   ├── maskedlm_imdb.py
│   │   │   │   │   ├── tokenclassification_conll2003.py
│   │   │   │   │   ├── noheader_AdvertiseGen.py
│   │   │   │   │   └── text_generation_AdvertiseGen.py
│   │   │   │   ├── callback.py
│   │   │   │   ├── _base.py
│   │   │   │   ├── test
│   │   │   │   │   └── test_seq_cls_bert_yelp.py
│   │   │   │   ├── ray_train.py
│   │   │   │   └── transformer.py
│   │   │   ├── pipelines
│   │   │   │   ├── llamacpp
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── processors.py
│   │   │   │   │   └── llamacpp_pipeline.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── processors.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── default_pipeline.py
│   │   │   │   └── default_transformers_pipeline.py
│   │   │   ├── initializers
│   │   │   │   ├── hf_transformers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── deepspeed.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _base.py
│   │   │   │   └── llamacpp.py
│   │   │   └── utils.py
│   │   ├── server
│   │   │   ├── __init__.py
│   │   │   ├── exceptions.py
│   │   │   ├── _batch.py
│   │   │   ├── run.py
│   │   │   └── config.py
│   │   └── logger.py
│   ├── frontend
│   │   ├── __init__.py
│   │   ├── mongo_secrets.py
│   │   ├── javascript_loader.py
│   │   ├── app.py
│   │   ├── javascript
│   │   │   └── llmadmin.js
│   │   ├── utils.py
│   │   ├── mongo_logger.py
│   │   └── leaderboard.py
│   └── __init__.py
├── docs
│   └── llm-finetune.png
├── MANIFEST.in
├── pyproject.toml
├── dataset
│   └── glue
│       └── mrpc
│           └── 1.0.0
│               ├── test-00000-of-00001.parquet
│               ├── train-00000-of-00001.parquet
│               └── validation-00000-of-00001.parquet
├── llm_finetune.py
├── requirements.txt
├── models
│   ├── ft--sequenceclassification--bert-base-uncased.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B-Chat.yaml
│   ├── ft--text-generation--THUDM-chatglm2-6b.yaml
│   ├── ft--text-generation--Qwen-Qwen-7B.yaml
│   ├── ft--sequenceclassification--bert-base-uncased-lora.yaml
│   ├── ft--maskedlm--distilbert-base-uncased.yaml
│   └── ft--text-generation--THUDM-chatglm3-6b.yaml
├── setup.py
├── llm_finetune_ray.py
├── .gitignore
├── README.md
└── LICENSE

/llmadmin/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/common/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/backend/llm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/frontend/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/backend/server/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llmadmin/__init__.py:
--------------------------------------------------------------------------------
1 | from llmadmin.api.sdk import *
2 | 
--------------------------------------------------------------------------------
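The YAML files under models/ and the modules that follow fit together as a small fine-tuning pipeline: each models/*.yaml file describes the base model (model_config) and the fine-tuning job (ft_config), and the TransformersFT driver exported from llmadmin.backend.llm.ft consumes that configuration and runs the Hugging Face training loop. The minimal sketch below illustrates that flow; it assumes FTApp (defined in llmadmin.backend.server.models, not included in this listing) is a pydantic v1 model that accepts the YAML structure directly, so the loading step may differ from the project's real entry points (llm_finetune.py and the llmfinetune console script).

    import yaml

    from llmadmin.backend.llm.ft import TransformersFT
    from llmadmin.backend.server.models import FTApp  # assumption: pydantic v1 model mirroring the YAML layout

    # Load one of the bundled configs (BERT sequence classification on GLUE/MRPC).
    with open("models/ft--sequenceclassification--bert-base-uncased.yaml") as f:
        raw = yaml.safe_load(f)

    ftapp = FTApp.parse_obj(raw)

    # TransformersFT subclasses BaseFT, which is constructed from an FTApp and exposes train().
    TransformersFT(ftapp).train()
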
/llmadmin/backend/llm/ft/const.py: -------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH = "./fintuned/" -------------------------------------------------------------------------------- /llmadmin/backend/server/exceptions.py: -------------------------------------------------------------------------------- 1 | class PromptTooLongError(ValueError): 2 | pass 3 | -------------------------------------------------------------------------------- /docs/llm-finetune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/docs/llm-finetune.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE *.sh 2 | recursive-include tests *.py 3 | recursive-include models *.yaml 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | select = ["E", "F", "I", "ASYNC", "B"] 3 | line-length = 300 4 | ignore = ["F403", "B905"] -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import get_train_model 2 | 3 | __all__ = [ 4 | "get_train_model" 5 | ] 6 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/__init__.py: -------------------------------------------------------------------------------- 1 | from .llamacpp_pipeline import LlamaCppPipeline 2 | 3 | __all__ = ["LlamaCppPipeline"] -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/test-00000-of-00001.parquet -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/train-00000-of-00001.parquet -------------------------------------------------------------------------------- /dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCSGs/llm-finetune/HEAD/dataset/glue/mrpc/1.0.0/validation-00000-of-00001.parquet -------------------------------------------------------------------------------- /llm_finetune.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from llmadmin.api.cli import app 4 | if __name__ == '__main__': 5 | sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) 6 | sys.exit(app()) 7 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ._base import BaseFT 4 | from .transformer import TransformersFT 5 | from .ray_train 
import RayTrain 6 | 7 | 8 | __all__ = [ 9 | "TransformersFT", "RayTrain" 10 | ] 11 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/base.py: -------------------------------------------------------------------------------- 1 | from .lora import lora_model 2 | from llmadmin.backend.logger import get_logger 3 | 4 | logger = get_logger(__name__) 5 | 6 | def get_train_model(model, ft_method, trainConfig): 7 | if ft_method == "lora": 8 | lora_config = trainConfig.lora_config 9 | model = lora_model(model, lora_config) 10 | return model 11 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/utils.py: -------------------------------------------------------------------------------- 1 | from llmadmin.backend.server.models import FTApp 2 | 3 | def parse_task_name(ftapp: FTApp): 4 | task_purpose = (ftapp.ft_config.ft_task + "-") if ftapp.ft_config.ft_task else "" 5 | data_path = ftapp.ft_config.data_config.data_path 6 | data_name = ("-" + ftapp.ft_config.data_config.subset) if ftapp.ft_config.data_config.subset else "" 7 | 8 | return task_purpose + data_path + data_name 9 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/hf_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DeviceMapInitializer, SingleDeviceInitializer, TransformersInitializer, FinetuneInitializer, AutoModelInitializer, TransformersPipelineInitializer 2 | 3 | __all__ = [ 4 | "DeviceMapInitializer", 5 | "SingleDeviceInitializer", 6 | "TransformersInitializer", 7 | "FinetuneInitializer", 8 | "TransformersPipelineInitializer", 9 | "AutoModelInitializer", 10 | ] 11 | -------------------------------------------------------------------------------- /llmadmin/backend/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | 5 | LOG_FORMAT = ( 6 | "[%(levelname)s %(asctime)s]{rank} %(filename)s: %(lineno)d " "%(message)s" 7 | ) 8 | 9 | 10 | def get_logger(name: str = None, rank: Optional[int] = None, **kwargs): 11 | if rank is None: 12 | rank = int(os.environ.get("RANK", -1)) 13 | logger = logging.getLogger(name) 14 | level = logging.ERROR if rank > 0 else logging.INFO 15 | log_format = LOG_FORMAT.format(rank=f"[Rank {rank}]" if rank > -1 else "") 16 | logging.basicConfig(level=level, format=log_format, **kwargs) 17 | return logger 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | async_timeout==4.0.3 2 | boto3==1.34.54 3 | datasets==2.18.0 4 | evaluate==0.4.1 5 | fastapi==0.100.1 6 | filelock==3.13.1 7 | gradio==3.39.0 8 | huggingface_hub==0.21.3 9 | jieba==0.42.1 10 | mdit_py_plugins==0.3.3 11 | nltk==3.8.1 12 | numpy==1.26.4 13 | optimum==1.17.1 14 | pandas==2.2.1 15 | peft==0.9.0 16 | pydantic==1.10.9 17 | pymongo==4.6.2 18 | PyYAML==6.0.1 19 | Requests==2.31.0 20 | rich==13.7.1 21 | rouge_chinese==1.0.3 22 | torch==2.1.2 23 | transformers==4.33.0 24 | typer==0.9.0 25 | typing_extensions==4.10.0 26 | socksio==1.0.0 27 | scipy==1.11.1 28 | einops 29 | transformers_stream_generator 30 | tiktoken 31 | cpm_kernels 32 | ray[serve]==2.20.0 33 | ray[train]==2.20.0 34 | 
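# A possible environment-setup sketch (an assumption; this listing includes no install docs):
#   pip install -r requirements.txt   # the pinned dependencies above
#   pip install -e .                  # installs llmadmin and the `llmfinetune` console script declared in setup.py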
-------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ._base import BasePipeline 4 | from .default_pipeline import DefaultPipeline 5 | from .default_transformers_pipeline import DefaultTransformersPipeline 6 | from .llamacpp import LlamaCppPipeline 7 | 8 | 9 | def get_pipeline_cls_by_name(name: str) -> Type[BasePipeline]: 10 | lowercase_globals = {k.lower(): v for k, v in globals().items()} 11 | ret = lowercase_globals.get( 12 | f"{name.lower()}pipeline", lowercase_globals.get(name.lower(), None) 13 | ) 14 | assert ret 15 | return ret 16 | 17 | 18 | __all__ = [ 19 | "get_pipeline_cls_by_name", 20 | "DefaultPipeline", 21 | "DefaultTransformersPipeline", 22 | "LlamaCppPipeline", 23 | ] 24 | -------------------------------------------------------------------------------- /llmadmin/api/env.py: -------------------------------------------------------------------------------- 1 | def has_ray(): 2 | try: 3 | import ray # noqa: F401 4 | 5 | return True 6 | except ImportError: 7 | return False 8 | 9 | 10 | def has_backend(): 11 | try: 12 | import llmadmin.backend # noqa: F401 13 | 14 | return True 15 | except ImportError: 16 | return True 17 | 18 | 19 | def assert_has_ray(): 20 | assert has_ray(), ( 21 | "This command requires ray to be installed. " 22 | "Please install ray with `pip install 'ray[default]'`" 23 | ) 24 | 25 | 26 | def assert_has_backend(): 27 | assert has_backend(), ( 28 | "This command requires llmadmin backend to be installed. " 29 | "Please install backend dependencies with `pip install llmadmin[backend]`. " 30 | ) 31 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/processors.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from llama_cpp import LogitsProcessor, StoppingCriteria 5 | from transformers import MaxTimeCriteria, MinNewTokensLengthLogitsProcessor 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | class LlamaCppMinNewTokensLengthLogitsProcessor( 13 | MinNewTokensLengthLogitsProcessor, LogitsProcessor 14 | ): 15 | def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: 16 | scores = MinNewTokensLengthLogitsProcessor.__call__( 17 | self, torch.LongTensor(input_ids), torch.FloatTensor(scores)[None, :] 18 | ) 19 | return scores[0].tolist() 20 | 21 | 22 | class LlamaMaxTimeCriteria(MaxTimeCriteria, StoppingCriteria): 23 | pass -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Type 2 | 3 | from .hf_transformers import ( 4 | DeviceMapInitializer, 5 | SingleDeviceInitializer, 6 | FinetuneInitializer, 7 | TransformersPipelineInitializer, 8 | AutoModelInitializer, 9 | ) 10 | 11 | if TYPE_CHECKING: 12 | from ._base import LLMInitializer 13 | 14 | from .llamacpp import LlamaCppInitializer 15 | 16 | 17 | def get_initializer_cls_by_name(name: str) -> Type["LLMInitializer"]: 18 | lowercase_globals = {k.lower(): v for k, v in globals().items()} 19 | ret = lowercase_globals.get( 20 | f"{name.lower()}initializer", lowercase_globals.get(name.lower(), None) 21 | ) 22 
| assert ret 23 | return ret 24 | 25 | 26 | __all__ = [ 27 | "get_initializer_cls_by_name", 28 | "DeviceMapInitializer", 29 | "SingleDeviceInitializer", 30 | "FinetuneInitializer", 31 | "AutoModelInitializer", 32 | "LlamaCppInitializer", 33 | "TransformersPipelineInitializer", 34 | ] 35 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from . import sequenceclassification_glue_cola 2 | from . import sequenceclassification_glue_mrpc 3 | from . import tokenclassification_conll2003 4 | from . import noheader_AdvertiseGen 5 | from . import text_generation_AdvertiseGen 6 | from . import maskedlm_imdb 7 | from . import sequenceclassification_yelp_review_full 8 | 9 | TASK_REGISTRY = { 10 | "sequenceclassification-glue-cola": sequenceclassification_glue_cola.SequenceclassificationGlueCola, 11 | "sequenceclassification-glue-mrpc": sequenceclassification_glue_mrpc.SequenceclassificationGlueMrpc, 12 | "tokenclassification-conll2003": tokenclassification_conll2003.TokenclassificationConll2003, 13 | "noheader-AdvertiseGen": noheader_AdvertiseGen.NoheaderAdvertiseGen, 14 | "text-generation-AdvertiseGen": text_generation_AdvertiseGen.NoheaderAdvertiseGen, 15 | "maskedlm-imdb": maskedlm_imdb.MaskedLMImdb, 16 | "sequenceclassification-yelp_review_full": sequenceclassification_yelp_review_full.SequenceclassificationYelpReviewFull 17 | } -------------------------------------------------------------------------------- /models/ft--sequenceclassification--bert-base-uncased.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: bert-base-uncased 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | ft_config: 13 | ft_task: "sequenceclassification" 14 | data_config: 15 | data_path: glue 16 | subset: mrpc 17 | local_path: dataset/glue/mrpc/1.0.0 18 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data 19 | # train_file: 20 | # validation_file: 21 | input_columns: 22 | - "sentence" 23 | validation_column: validation 24 | # labels 25 | train_config: 26 | base_config: 27 | checkpoints_output_dir: finetune_models/ 28 | per_device_train_batch_size: 8 29 | learning_rate: 2e-5 30 | num_train_epochs: 2 31 | weight_decay: 0.01 32 | logging_strategy: steps 33 | evaluation_strategy: steps 34 | save_strategy: steps 35 | save_steps: 100 36 | -------------------------------------------------------------------------------- /models/ft--text-generation--Qwen-Qwen-7B-Chat.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: Qwen/Qwen-7B-Chat 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | ft_config: 13 | ft_task: "text-generation" 14 | data_config: 15 | data_path: AdvertiseGen 16 | local_path: dataset/AdvertiseGen 17 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 18 | input_columns: 19 | - "content" 20 | validation_column: summary 21 | train_config: 22 | base_config: 23 | max_length: 500 24 | checkpoints_output_dir: /tmp/finetune 25 | per_device_train_batch_size: 1 26 | per_device_eval_batch_size: 1 27 | learning_rate: 2e-5 28 | num_train_epochs: 2 29 | weight_decay: 0.01 30 | remove_unused_columns: true 31 | logging_strategy: steps 32 | evaluation_strategy: steps 33 | save_strategy: steps 34 | save_steps: 25 35 | max_steps: 50 36 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/callback.py: -------------------------------------------------------------------------------- 1 | from transformers import TrainerCallback, TrainerState, TrainerControl, TrainingArguments 2 | import threading 3 | import queue 4 | 5 | QUEUE = queue.Queue() 6 | def send_metrics(): 7 | while True: 8 | item = QUEUE.get() 9 | print("============") 10 | print(item) 11 | QUEUE.task_done() 12 | 13 | threading.Thread(target=send_metrics, daemon=True).start() 14 | 15 | class CustomCallback(TrainerCallback): 16 | """ 17 | Overriding the trainer callback to be able to compute training accuracy as well 18 | Example taken from: 19 | https://stackoverflow.com/questions/67457480/how-to-get-the-accuracy-per-epoch-or-step-for-the-huggingface-transformers-train 20 | """ 21 | METRICS_FILE = "./metrics" 22 | 23 | def __init__(self, trainer) -> None: 24 | super().__init__() 25 | self._trainer = trainer 26 | 27 | def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 28 | if control.should_log: 29 | if len(state.log_history) != 0: 30 | QUEUE.put(state.log_history[-1]) 31 | return control -------------------------------------------------------------------------------- /models/ft--text-generation--THUDM-chatglm2-6b.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: THUDM/chatglm2-6b 5 | max_input_words: 800 6 | quantization_bit: 4 7 | initialization: 8 | initializer: 9 | type: Finetune 10 | dtype: float32 11 | from_pretrained_kwargs: 12 | trust_remote_code: true 13 | # load_in_8bit: True 14 | ft_config: 15 | ft_task: "text-generation" 16 | data_config: 17 | data_path: AdvertiseGen 18 | local_path: dataset/AdvertiseGen 19 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 20 | input_columns: 21 | - "content" 22 | validation_column: summary 23 | train_config: 24 | base_config: 25 | max_length: 500 26 | checkpoints_output_dir: /tmp/finetune 27 | per_device_train_batch_size: 1 28 | per_device_eval_batch_size: 1 29 | learning_rate: 2e-5 30 | num_train_epochs: 2 31 | weight_decay: 0.01 32 | remove_unused_columns: true 33 | logging_strategy: steps 34 | evaluation_strategy: steps 35 | save_strategy: steps 36 | save_steps: 25 37 | max_steps: 50 38 | -------------------------------------------------------------------------------- /models/ft--text-generation--Qwen-Qwen-7B.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: Qwen/Qwen-7B 5 | max_input_words: 800 6 | initialization: 7 | initializer: 8 | type: Finetune 9 | dtype: float32 10 | from_pretrained_kwargs: 11 | trust_remote_code: true 12 | add_special_tokens: 13 | pad_token: "<|extra_0|>" 14 | eos_token: "<|endoftext|>" 15 | ft_config: 16 | ft_task: "text-generation" 17 | data_config: 18 | data_path: AdvertiseGen 19 | local_path: dataset/AdvertiseGen 20 | num_row: 30 # 0: Train with all data. >0: Test with $num_row data 21 | input_columns: 22 | - "content" 23 | validation_column: summary 24 | train_config: 25 | base_config: 26 | max_length: 500 27 | checkpoints_output_dir: /tmp/finetune 28 | per_device_train_batch_size: 1 29 | per_device_eval_batch_size: 1 30 | learning_rate: 2e-5 31 | num_train_epochs: 2 32 | weight_decay: 0.01 33 | remove_unused_columns: true 34 | logging_strategy: steps 35 | evaluation_strategy: steps 36 | save_strategy: steps 37 | save_steps: 25 38 | max_steps: 50 39 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/methods/lora.py: -------------------------------------------------------------------------------- 1 | from peft import get_peft_model 2 | from llmadmin.backend.logger import get_logger 3 | 4 | logger = get_logger(__name__) 5 | 6 | def get_trainable_parameters(model): 7 | """ 8 | get the number of trainable parameters in the model. 9 | """ 10 | trainable_params = 0 11 | all_param = 0 12 | for _, param in model.named_parameters(): 13 | all_param += param.numel() 14 | if param.requires_grad: 15 | trainable_params += param.numel() 16 | logger.info( 17 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 18 | ) 19 | 20 | def lora_model(model, lora_config): 21 | logger.info("Load lora config") 22 | logger.info(lora_config) 23 | # from peft import LoraConfig, TaskType 24 | # lora_config = LoraConfig( 25 | # task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1 26 | # ) 27 | # logger.info(lora_config) 28 | lora_config.loftq_config = {} 29 | logger.info("Using peft to avoid Catastrophic Forgetting") 30 | model = get_peft_model(model, lora_config) 31 | get_trainable_parameters(model) 32 | return model 33 | -------------------------------------------------------------------------------- /llmadmin/frontend/mongo_secrets.py: -------------------------------------------------------------------------------- 1 | # Use this code snippet in your app. 
2 | # If you need more information about configurations 3 | # or implementing the sample code, visit the AWS docs: 4 | # https://aws.amazon.com/developer/language/python/ 5 | 6 | import json 7 | import logging 8 | import os 9 | 10 | import boto3 11 | 12 | 13 | def get_mongo_secret_url(): 14 | mongo_url = os.getenv("MONGODB_URL") 15 | if mongo_url: 16 | return mongo_url 17 | try: 18 | secret_name = "prod/frontend/mongo_password" 19 | region_name = "us-west-2" 20 | 21 | # Create a Secrets Manager client 22 | session = boto3.session.Session() 23 | client = session.client(service_name="secretsmanager", region_name=region_name) 24 | 25 | get_secret_value_response = client.get_secret_value(SecretId=secret_name) 26 | 27 | # Decrypts secret using the associated KMS key. 28 | secret = get_secret_value_response["SecretString"] 29 | 30 | secret_dict = json.loads(secret) 31 | mongo_url = secret_dict.get("url") 32 | return mongo_url 33 | except Exception as e: 34 | # Fail quietly if we can't get the secret 35 | logging.warning(f"Failed to retrieve mongo secret, Exception: {e}") 36 | -------------------------------------------------------------------------------- /llmadmin/common/llm_event.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from enum import Enum 3 | from typing import Dict, List, Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class Flag(Enum): 9 | HATE = "hate" 10 | OBSCENE = "obscene" 11 | WRONG_LANGUAGE = "wrong-language" 12 | NONFACTUAL = "non-factual" 13 | 14 | 15 | class Vote(BaseModel): 16 | llm: str 17 | score: float 18 | 19 | 20 | class LlmResponse(BaseModel): 21 | model_id: str 22 | text: str 23 | model_config: Optional[Dict] 24 | gen_stats: Optional[Dict] 25 | 26 | 27 | class LlmEvent(BaseModel): 28 | created_at: datetime 29 | # Name of the project 30 | project_name: str 31 | 32 | # Identifier for a session 33 | session_id: Optional[str] 34 | 35 | # unique string representing this event 36 | instance_id: str 37 | 38 | # Prompt given by the user 39 | user_prompt: str 40 | responses: List[LlmResponse] 41 | 42 | # Vote is a dictionary by llm and the votes 43 | # that model got. Typically, this is 1. 44 | votes: Optional[List[Vote]] 45 | vote_comments: Optional[Dict[str, str]] 46 | 47 | # Key: llm 48 | # Value: list of flags 49 | flag: Optional[Dict[str, List[Flag]]] 50 | 51 | # Key: llm 52 | # Value: Comment for each llm 53 | flag_comments: Optional[Dict[str, str]] 54 | -------------------------------------------------------------------------------- /models/ft--sequenceclassification--bert-base-uncased-lora.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: bert-base-uncased 5 | initialization: 6 | initializer: 7 | type: Finetune 8 | dtype: float32 9 | from_pretrained_kwargs: 10 | trust_remote_code: true 11 | ft_config: 12 | # ft_stage: "sft" 13 | ft_method: "lora" 14 | ft_task: "sequenceclassification" 15 | data_config: 16 | data_path: glue 17 | subset: mrpc 18 | local_path: dataset/glue/mrpc/1.0.0 19 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 20 | input_columns: 21 | - "sentence" 22 | validation_column: validation 23 | train_config: 24 | lora_config: 25 | r: 1 # Lora attention dimension 26 | task_type: SEQ_CLS #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION 27 | lora_alpha: 1 # The alpha parameter for Lora scaling 28 | lora_dropout: 0.1 # The dropout probability for Lora layers 29 | base_config: 30 | checkpoints_output_dir: finetune_models/ 31 | per_device_train_batch_size: 8 32 | learning_rate: 2e-5 33 | num_train_epochs: 2 34 | weight_decay: 0.01 35 | logging_strategy: steps 36 | evaluation_strategy: steps 37 | save_strategy: steps 38 | save_steps: 100 39 | -------------------------------------------------------------------------------- /models/ft--maskedlm--distilbert-base-uncased.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: fill-mask 4 | model_id: distilbert-base-uncased 5 | max_input_words: 800 6 | initialization: 7 | runtime_env: 8 | pip: 9 | - deepspeed==0.9.2 10 | - accelerate 11 | s3_mirror_config: 12 | bucket_uri: /tmp/hub/models/distilbert-base-uncased/ 13 | # bucket_uri: s3://large-dl-models-mirror/models--amazon--LightGPT/main-safetensors/ 14 | initializer: 15 | type: Finetune 16 | dtype: float32 17 | from_pretrained_kwargs: 18 | # use_cache: true 19 | trust_remote_code: true 20 | # use_kernel: true # for deepspped type only 21 | # max_tokens: 1536 # for deepspped type only 22 | ft_config: 23 | ft_task: maskedlm 24 | data_config: 25 | data_path: imdb 26 | subset: 27 | local_path: /tmp/hub/dataset/imdb/plain_text/1.0.0 28 | num_row: 30 29 | # train_file: 30 | # validation_file: 31 | input_columns: 32 | - "sentence" 33 | validation_column: validation 34 | # labels 35 | train_config: 36 | base_config: 37 | checkpoints_output_dir: /tmp/finetune 38 | per_device_train_batch_size: 32 39 | learning_rate: 2e-5 40 | num_train_epochs: 2 41 | weight_decay: 0.01 42 | remove_unused_columns: false 43 | logging_strategy: steps 44 | evaluation_strategy: steps 45 | save_strategy: steps 46 | save_steps: 100 47 | scaling_config: 48 | num_workers: 7 49 | num_gpus_per_worker: 0 50 | num_cpus_per_worker: 1 # for infrence 51 | # resources_per_worker: 52 | # accelerator_type_cpu: 0.01 53 | ray_actor_options: 54 | num_cpus: 0.1 55 | -------------------------------------------------------------------------------- /models/ft--text-generation--THUDM-chatglm3-6b.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | warmup: True 3 | model_task: text-generation 4 | model_id: THUDM/chatglm3-6b 5 | max_input_words: 800 6 | quantization_bit: 4 7 | initialization: 8 | # s3_mirror_config: 9 | # endpoint_url: http://39.107.108.170:9000 # Optinal for custom S3 storage endpoint url 10 | # bucket_uri: s3://opt-125m/facemodel/ # Must include hash file with commit id in repo 11 | # bucket_uri: /root/.cache/hub/ZhipuAI/chatglm3-6b/ # Local path of model with hash file 12 | initializer: 13 | type: Finetune 14 | dtype: float32 15 | from_pretrained_kwargs: 16 | trust_remote_code: true 17 | ft_config: 18 | ft_task: "text-generation" 19 | ft_method: "lora" 20 | data_config: 21 | data_path: AdvertiseGen 22 | local_path: dataset/AdvertiseGen 23 | num_row: 30 # 0: Train with all data. 
>0: Test with $num_row data 24 | input_columns: 25 | - "content" 26 | validation_column: summary 27 | train_config: 28 | lora_config: 29 | r: 1 # Lora attention dimension 30 | task_type: CAUSAL_LM #SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION 31 | lora_alpha: 1 # The alpha parameter for Lora scaling 32 | lora_dropout: 0.1 # The dropout probability for Lora layers 33 | base_config: 34 | max_length: 500 35 | checkpoints_output_dir: /tmp/finetune 36 | per_device_train_batch_size: 1 37 | per_device_eval_batch_size: 1 38 | learning_rate: 2e-5 39 | num_train_epochs: 2 40 | weight_decay: 0.01 41 | remove_unused_columns: true 42 | logging_strategy: steps 43 | evaluation_strategy: steps 44 | save_strategy: steps 45 | save_steps: 25 46 | max_steps: 50 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import find_packages, setup 3 | this_directory = os.path.abspath(os.path.dirname(__file__)) 4 | with open(os.path.join(this_directory, "requirements.txt"), encoding="utf-8") as f: 5 | INSTALL_REQUIRES = f.read().splitlines() 6 | 7 | EXTRAS_REQUIRE = { 8 | "dev": INSTALL_REQUIRES + [ 9 | "pre-commit", 10 | "ruff==0.0.270", 11 | "black==23.3.0", 12 | ], 13 | "test": INSTALL_REQUIRES + [ 14 | "pytest", 15 | ], 16 | "docs": INSTALL_REQUIRES + [ 17 | "mkdocs-material", 18 | ], 19 | } 20 | 21 | setup( 22 | name="llmfinetune", 23 | version="0.0.1", 24 | description="A framework to finetune LLMs", 25 | long_description=open("README.md", "r", encoding="utf-8").read(), 26 | long_description_content_type="text/markdown", 27 | packages=find_packages(include="llmadmin*"), 28 | keywords=["ChatGLM", "BaiChuan", "LLaMA", "BLOOM", "Falcon", 29 | "LLM", "ChatGPT", "transformer", "pytorch", "deep learning"], 30 | include_package_data=True, 31 | package_data={"llmadmin": ["models/*"]}, 32 | entry_points={ 33 | "console_scripts": [ 34 | "llmfinetune=llmadmin.api.cli:app", 35 | ] 36 | }, 37 | extras_require=EXTRAS_REQUIRE, 38 | install_requires=INSTALL_REQUIRES, 39 | python_requires=">=3.8", 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "Intended Audience :: Education", 44 | "Intended Audience :: Science/Research", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Operating System :: OS Independent", 47 | "Programming Language :: Python :: 3.8", 48 | "Programming Language :: Python :: 3.9", 49 | "Programming Language :: Python :: 3.10", 50 | "Programming Language :: Python :: 3.11", 51 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 52 | ] 53 | ) 54 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from llmadmin.backend.logger import get_logger 3 | from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset 4 | from typing import Union, TYPE_CHECKING, List 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | from llmadmin.backend.server.models import FTApp 7 | import torch 8 | from llmadmin.backend.llm.initializers import get_initializer_cls_by_name 9 | 10 | if TYPE_CHECKING: 11 | from ..initializers._base import LLMInitializer 12 | 13 | logger = get_logger(__name__) 14 | 15 | class BaseFT(ABC): 16 | """base fine tune 
class. 17 | 18 | Args: 19 | """ 20 | 21 | def __init__( 22 | self, 23 | ftapp: FTApp, 24 | ) -> None: 25 | self.ftapp = ftapp 26 | self.data_conf = ftapp.ft_config.data_config 27 | self.train_conf = ftapp.ft_config.train_config.base_config 28 | self.model_config = ftapp.model_config 29 | self.ft_task = ftapp.ft_config.ft_task 30 | self.scale_config = ftapp.scaling_config 31 | 32 | # Lazy import so that the new cache location is used 33 | torch.backends.cuda.matmul.allow_tf32 = True 34 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 35 | 36 | initializer_name = self.model_config.initialization.initializer 37 | if not isinstance(initializer_name, str): 38 | initializer_name = initializer_name.type 39 | 40 | logger.info(f"Finetune initializer name '{initializer_name}' on device {device}") 41 | initializer = get_initializer_cls_by_name(initializer_name)( 42 | device=device, 43 | world_size=1, # fake 44 | **self.model_config.initialization.initializer.get_initializer_kwargs(), 45 | ) 46 | 47 | self.initializer = initializer 48 | 49 | @abstractmethod 50 | def train(self): 51 | pass 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/test/test_seq_cls_bert_yelp.py: -------------------------------------------------------------------------------- 1 | # Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training 2 | 3 | import numpy as np 4 | import evaluate 5 | from datasets import load_dataset 6 | from transformers import ( 7 | Trainer, 8 | TrainingArguments, 9 | AutoTokenizer, 10 | AutoModelForSequenceClassification, 11 | ) 12 | 13 | num_labels = 5 14 | modelPath = "bert-base-cased" 15 | modelPath = "/Users/hub/models/bert-base-cased" 16 | dsPath = "yelp_review_full" 17 | dsPath = "/Users/hub/models/yelp_review_full/1.0.0" 18 | 19 | # Datasets 20 | dataset = load_dataset(dsPath) 21 | print('Loaded dataset', dataset) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained(modelPath) 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128) 27 | 28 | count = 10 29 | small_train_dataset = dataset["train"].select(range(count)).map(tokenize_function, batched=True) 30 | small_eval_dataset = dataset["test"].select(range(count)).map(tokenize_function, batched=True) 31 | print('small train dataset', small_train_dataset) 32 | print('small eval dataset', small_eval_dataset) 33 | 34 | # Model 35 | model = AutoModelForSequenceClassification.from_pretrained(modelPath, num_labels=num_labels) 36 | 37 | # Metrics 38 | metric = evaluate.load("accuracy") 39 | 40 | def compute_metrics(eval_pred): 41 | logits, labels = eval_pred 42 | predictions = np.argmax(logits, axis=-1) 43 | return metric.compute(predictions=predictions, references=labels) 44 | 45 | # Hugging Face Trainer 46 | training_args = TrainingArguments( 47 | output_dir="test_trainer", 48 | evaluation_strategy="epoch", 49 | report_to="none" 50 | ) 51 | 52 | trainer = Trainer( 53 | model=model, 54 | args=training_args, 55 | train_dataset=small_train_dataset, 56 | eval_dataset=small_eval_dataset, 57 | compute_metrics=compute_metrics, 58 | ) 59 | 60 | # Start Training 61 | trainer.train() 62 | -------------------------------------------------------------------------------- /llmadmin/frontend/javascript_loader.py: -------------------------------------------------------------------------------- 1 | # 
https://github.com/gradio-app/gradio/discussions/2932 2 | import mimetypes 3 | import os 4 | 5 | import gradio.routes 6 | 7 | mimetypes.init() 8 | mimetypes.add_type("application/javascript", ".js") 9 | 10 | 11 | class ScriptLoader: 12 | path_map = { 13 | "js": os.path.abspath(os.path.join(os.path.dirname(__file__), "javascript")), 14 | "py": os.path.abspath(os.path.join(os.path.dirname(__file__), "python")), 15 | } 16 | 17 | def __init__(self, script_type): 18 | self.script_type = script_type 19 | self.path = ScriptLoader.path_map[script_type] 20 | self.loaded_scripts = [] 21 | 22 | @staticmethod 23 | def get_scripts(path: str, file_type: str) -> list[tuple[str, str]]: 24 | scripts = [] 25 | dir_list = [os.path.join(path, f) for f in os.listdir(path)] 26 | files_list = [f for f in dir_list if os.path.isfile(f)] 27 | for s in files_list: 28 | # Dont forget the "." for file extension 29 | if os.path.splitext(s)[1] == f".{file_type}": 30 | scripts.append((s, os.path.basename(s))) 31 | return scripts 32 | 33 | 34 | class JavaScriptLoader(ScriptLoader): 35 | def __init__(self): 36 | super().__init__("js") 37 | self.original_template = gradio.routes.templates.TemplateResponse 38 | self.load_js() 39 | gradio.routes.templates.TemplateResponse = self.template_response 40 | 41 | def load_js(self): 42 | js_scripts = ScriptLoader.get_scripts(self.path, self.script_type) 43 | for file_path, file_name in js_scripts: 44 | with open(file_path, "r", encoding="utf-8") as file: 45 | self.loaded_scripts.append( 46 | f"\n\n" 47 | ) 48 | 49 | def template_response(self, *args, **kwargs): 50 | response = self.original_template(*args, **kwargs) 51 | response.body = response.body.replace( 52 | "".encode("utf-8"), 53 | f"{''.join(self.loaded_scripts)}\n".encode("utf-8"), 54 | ) 55 | response.init_headers() 56 | return response 57 | -------------------------------------------------------------------------------- /llmadmin/frontend/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | import uuid 5 | from typing import Any, Dict, List 6 | import ray 7 | import requests 8 | 9 | from llmadmin.common.backend import get_llmadmin_backend 10 | from llmadmin.common.constants import ( 11 | AVIARY_DESC, 12 | CSS, 13 | EXAMPLES_IF, 14 | EXAMPLES_QA, 15 | EXAMPLES_ST, 16 | HEADER, 17 | LOGO_ANYSCALE, 18 | LOGO_GITHUB, 19 | LOGO_RAY, 20 | LOGO_RAY_TYPEFACE, 21 | MODEL_DESCRIPTION_FORMAT, 22 | MODEL_DESCRIPTIONS_HEADER, 23 | MODELS, 24 | NUM_LLM_OPTIONS, 25 | PROJECT_NAME, 26 | SELECTION_DICT, 27 | SUB_HEADER, 28 | ) 29 | from llmadmin.frontend.javascript_loader import JavaScriptLoader 30 | from llmadmin.frontend.leaderboard import DummyLeaderboard, Leaderboard 31 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url 32 | from llmadmin.frontend.utils import ( 33 | DEFAULT_STATS, 34 | LOGGER, 35 | THEME, 36 | blank, 37 | deactivate_buttons, 38 | gen_stats, 39 | log_flags, 40 | paused_logger, 41 | select_button, 42 | unset_buttons, 43 | ) 44 | 45 | std_logger = logging.getLogger("ray.logger") 46 | 47 | @ray.remote(num_cpus=0) 48 | def completions(bakend, prompt, llm, index): 49 | try: 50 | out = bakend.completions(prompt=prompt, llm=llm) 51 | except Exception as e: 52 | if isinstance(e, requests.ReadTimeout) or ( 53 | hasattr(e, "response") 54 | and ("timeout" in e.response or e.response.status_code in (408, 504)) 55 | ): 56 | out = ( 57 | "[LLM-ADMIN] The request timed out. 
This usually means the server " 58 | "is experiencing a higher than usual load. " 59 | "Please try again in a few minutes." 60 | ) 61 | elif hasattr(e, "response"): 62 | out = ( 63 | f"[LLM-ADMIN] Backend returned an error. " 64 | f"Status code: {e.response.status_code}" 65 | f"\nResponse: {e.response.text.split('raise ')[-1]}" 66 | ).replace("\n", " ") 67 | else: 68 | out = f"[LLM-ADMIN] An error occurred. Please try again.\nError: {e}" 69 | out = {"error": out} 70 | return out, index -------------------------------------------------------------------------------- /llmadmin/frontend/javascript/llmadmin.js: -------------------------------------------------------------------------------- 1 | // Set favicon 2 | const FAVICON = 3 | "data:image/svg+xml,🦜"; 4 | function setFavicon(link) { 5 | let favicon = document.querySelector('link[rel="icon"]'); 6 | 7 | if (favicon) { 8 | favicon.href = link; 9 | } else { 10 | favicon = document.createElement("link"); 11 | favicon.rel = "icon"; 12 | favicon.href = link; 13 | 14 | document.head.appendChild(favicon); 15 | } 16 | } 17 | // setFavicon(FAVICON); 18 | 19 | // Get news 20 | const NEWS_URL = "https://api.github.com/repos/ray-project/llmadmin/issues/8"; 21 | function getNews(newsUrl) { 22 | return fetch(newsUrl) 23 | .then((response) => { 24 | if (!response.ok) { 25 | throw new Error("Unable to fetch news."); 26 | } 27 | return response.text(); 28 | }) 29 | .then((data) => { 30 | return (title = JSON.parse(data)["title"]); 31 | }) 32 | .catch((error) => console.error("Unable to parse response: ", error)); 33 | } 34 | 35 | // Wait for the ticker div to be added to DOM to set the news content 36 | const observer = new MutationObserver((mutationsList, observer) => { 37 | for (let mutation of mutationsList) { 38 | if (mutation.type === "childList") { 39 | let element = document.getElementsByClassName("ticker"); 40 | if (element.length > 0) { 41 | getNews(NEWS_URL).then((newsTitle) => { 42 | document.getElementsByClassName("ticker")[0].innerHTML = 43 | "\uD83D\uDCE3 " + newsTitle; 44 | }); 45 | observer.disconnect(); 46 | break; 47 | } 48 | } 49 | } 50 | }); 51 | 52 | (function () { 53 | // Add Google Tag Manager 54 | const head = document.getElementsByTagName("head")[0]; 55 | var gtm = document.createElement("script"); 56 | gtm.text = 57 | "(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-5ZPDX2P');"; 58 | head.insertBefore(gtm, head.children[0]); 59 | 60 | document.addEventListener("DOMContentLoaded", function () { 61 | observer.observe(document.body, { childList: true, subtree: true }); 62 | }); 63 | })(); -------------------------------------------------------------------------------- /llmadmin/frontend/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | # import gradio as gr 4 | 5 | from llmadmin.common.constants import ( 6 | G5_COST_PER_S_IN_DOLLARS, 7 | NUM_LLM_OPTIONS, 8 | PROJECT_NAME, 9 | ) 10 | from llmadmin.frontend.mongo_logger import MongoLogger 11 | from llmadmin.frontend.mongo_secrets import get_mongo_secret_url 12 | 13 | LOGGER = None 14 | 15 | # MONGODB_URL = get_mongo_secret_url() 16 | # if MONGODB_URL: 17 | # LOGGER = MongoLogger(url=MONGODB_URL, project_name=PROJECT_NAME) 18 | # else: 19 | # 
print("No MongoDB logger defined, will default to the CSVLogger") 20 | # LOGGER = gr.CSVLogger() 21 | # LOGGER = gr.CSVLogger() 22 | 23 | 24 | DEFAULT_STATS = t = """ 25 | | | | 26 | |---|---| 27 | | Latency [s] | - | 28 | | Cost [$] | - | 29 | | Tokens (i/o) | - | 30 | | Per 1K Tokens [$] | - | 31 | """ 32 | 33 | 34 | def gen_stats(dictionary): 35 | cost_per_k = ( 36 | dictionary["total_time"] 37 | * G5_COST_PER_S_IN_DOLLARS 38 | / dictionary["num_total_tokens"] 39 | * 1000 40 | ) 41 | 42 | return f""" 43 | | | | 44 | |---|---| 45 | | Lat [s] | {dictionary['total_time']:.1f} | 46 | | Cost [$] | {dictionary['total_time'] * G5_COST_PER_S_IN_DOLLARS:.4f} | 47 | | Tokens (i/o) | {dictionary['num_total_tokens']:.1f} | 48 | | Per 1K Tok [$] | {cost_per_k:.4f} | 49 | """ 50 | 51 | 52 | def blank(): 53 | return "" 54 | 55 | 56 | # def select_button(button): 57 | # return button, gr.Button.update(variant="primary") 58 | 59 | 60 | # def deactivate_buttons(): 61 | # return [gr.Button.update(interactive=False)] * NUM_LLM_OPTIONS 62 | 63 | 64 | # def unset_buttons(): 65 | # return [gr.Button.update(variant="secondary", interactive=True)] * NUM_LLM_OPTIONS 66 | 67 | 68 | # def paused_logger(*args): 69 | # time.sleep(1) 70 | # LOGGER.flag(*args) 71 | 72 | 73 | # def log_flags(*args): 74 | # LOGGER.flag(args) 75 | 76 | 77 | # THEME = gr.themes.Default( 78 | # primary_hue="blue", 79 | # secondary_hue="blue", 80 | # ).set( 81 | # border_color_accent="blue", 82 | # shadow_spread="20", 83 | # shadow_spread_dark="0", 84 | # button_primary_background_fill="*primary_200", 85 | # button_primary_background_fill_dark="*primary_700", 86 | # button_primary_border_color_dark="*primary_600", 87 | # ) 88 | -------------------------------------------------------------------------------- /llmadmin/backend/server/_batch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass, field 3 | from enum import IntEnum 4 | from functools import wraps 5 | from typing import Any, Callable, List, Optional, Tuple, Type 6 | 7 | # TODO: Upstream to Serve. 8 | 9 | 10 | def extract_self_if_method_call(args: List[Any], func: Callable) -> Optional[object]: 11 | """Check if this is a method rather than a function. 12 | 13 | Does this by checking to see if `func` is the attribute of the first 14 | (`self`) argument under `func.__name__`. Unfortunately, this is the most 15 | robust solution to this I was able to find. It would also be preferable 16 | to do this check when the decorator runs, rather than when the method is. 17 | 18 | Returns the `self` object if it's a method call, else None. 19 | 20 | Arguments: 21 | args: arguments to the function/method call. 22 | func: the unbound function that was called. 
23 | """ 24 | if len(args) > 0: 25 | method = getattr(args[0], func.__name__, False) 26 | if method: 27 | wrapped = getattr(method, "__wrapped__", False) 28 | if wrapped and wrapped == func: 29 | return args[0] 30 | 31 | return None 32 | 33 | 34 | class QueuePriority(IntEnum): 35 | """Lower value = higher priority""" 36 | 37 | GENERATE_TEXT = 0 38 | BATCH_GENERATE_TEXT = 1 39 | 40 | 41 | @dataclass(order=True) 42 | class _PriorityWrapper: 43 | """Wrapper allowing for priority queueing of arbitrary objects.""" 44 | 45 | obj: Any = field(compare=False) 46 | priority: int = field(compare=True) 47 | 48 | 49 | class PriorityQueueWithUnwrap(asyncio.PriorityQueue): 50 | def get_nowait(self) -> Any: 51 | # Get just the obj from _PriorityWrapper 52 | ret: _PriorityWrapper = super().get_nowait() 53 | return ret.obj 54 | 55 | 56 | def _validate_max_batch_size(max_batch_size): 57 | if not isinstance(max_batch_size, int): 58 | if isinstance(max_batch_size, float) and max_batch_size.is_integer(): 59 | max_batch_size = int(max_batch_size) 60 | else: 61 | raise TypeError("max_batch_size must be integer >= 1") 62 | 63 | if max_batch_size < 1: 64 | raise ValueError("max_batch_size must be an integer >= 1") 65 | 66 | 67 | def _validate_batch_wait_timeout_s(batch_wait_timeout_s): 68 | if not isinstance(batch_wait_timeout_s, (float, int)): 69 | raise TypeError("batch_wait_timeout_s must be a float >= 0") 70 | 71 | if batch_wait_timeout_s < 0: 72 | raise ValueError("batch_wait_timeout_s must be a float >= 0") 73 | 74 | 75 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_cola.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationGlueCola(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "glue" 13 | DATASET_NAME = "cola" 14 | 15 | def get_data_proprocess(self) -> Any: 16 | tokenizer = self.tokenizer 17 | 18 | # adopt python decorator TODO 19 | def preprocess_function(examples: pd.DataFrame): 20 | # examples = examples.to_dict("list") 21 | ret = tokenizer(examples["sentence"], truncation=True) 22 | 23 | # Add back the original columns 24 | ret = {**examples, **ret} 25 | return pd.DataFrame.from_dict(ret) 26 | 27 | return preprocess_function 28 | 29 | def get_compute_metrics(self) -> Any: 30 | DATASET_PATH = self.DATASET_PATH 31 | DATASET_NAME = self.DATASET_NAME 32 | 33 | def compute_metrics(eval_preds): 34 | metric = evaluate.load(DATASET_PATH, DATASET_NAME) 35 | logits, labels = eval_preds 36 | predictions = np.argmax(logits, axis=-1) 37 | return metric.compute(predictions=predictions, references=labels) 38 | 39 | return compute_metrics 40 | 41 | def get_data_collator(self) -> Any: 42 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 43 | return data_collator 44 | 45 | def training_key(self): 46 | """ 47 | :return: Iterable[obj] 48 | A iterable of any object, that doc_to_text can handle 49 | """ 50 | return "train" 51 | 52 | def validation_key(self): 53 | """ 54 | :return: Iterable[obj] 55 | A iterable of any object, that doc_to_text can handle 56 | """ 57 | return "validation" 58 | 59 | def getTrainDataSet(self): 60 | return 
self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 61 | 62 | def getEvalDataSet(self): 63 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 64 | 65 | def getSmallTrainDataSet(self, len: int): 66 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 67 | 68 | def getSmallEvalDataSet(self, len: int): 69 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_glue_mrpc.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationGlueMrpc(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "glue" 13 | DATASET_NAME = "mrpc" 14 | FROM_PRETRAINED_KWARGS = { 15 | # "num_labels": 2 16 | } 17 | 18 | def get_data_proprocess(self) -> Any: 19 | tokenizer = self.tokenizer 20 | 21 | # adopt python decorator TODO 22 | def preprocess_function(examples: pd.DataFrame): 23 | # examples = examples.to_dict("list") 24 | ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=self.ft_config.train_config.base_config.max_length) 25 | 26 | # Add back the original columns 27 | ret = {**examples, **ret} 28 | return pd.DataFrame.from_dict(ret) 29 | 30 | return preprocess_function 31 | 32 | def get_compute_metrics(self) -> Any: 33 | DATASET_PATH = self.DATASET_PATH 34 | DATASET_NAME = self.DATASET_NAME 35 | 36 | def compute_metrics(eval_preds): 37 | metric = evaluate.load(DATASET_PATH, DATASET_NAME) 38 | logits, labels = eval_preds 39 | predictions = np.argmax(logits, axis=-1) 40 | return metric.compute(predictions=predictions, references=labels) 41 | 42 | return compute_metrics 43 | 44 | def get_data_collator(self) -> Any: 45 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 46 | return data_collator 47 | 48 | def training_key(self): 49 | """ 50 | :return: Iterable[obj] 51 | A iterable of any object, that doc_to_text can handle 52 | """ 53 | return "train" 54 | 55 | def validation_key(self): 56 | """ 57 | :return: Iterable[obj] 58 | A iterable of any object, that doc_to_text can handle 59 | """ 60 | return "validation" 61 | 62 | def getTrainDataSet(self): 63 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 64 | 65 | def getEvalDataSet(self): 66 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 67 | 68 | def getSmallTrainDataSet(self, len: int): 69 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 70 | 71 | def getSmallEvalDataSet(self, len: int): 72 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 73 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | from transformers import PreTrainedModel, 
PreTrainedTokenizer 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | class LLMInitializer(ABC): 13 | """Initialize model and tokenizer and place them on the correct device. 14 | 15 | Args: 16 | device (torch.device): Device to place model and tokenizer on. 17 | world_size (int): Number of GPUs to use. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | device: torch.device, 23 | world_size: int, 24 | ): 25 | self.device = device 26 | self.world_size = world_size 27 | 28 | def load(self, model_id: str) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]: 29 | """Load model and tokenizer. 30 | 31 | Args: 32 | model_id (str): Hugging Face model ID. 33 | """ 34 | model = self.load_model(model_id) 35 | tokenizer = self.load_tokenizer(model_id) 36 | return self.postprocess(model, tokenizer) 37 | 38 | @abstractmethod 39 | def load_model(self, model_id: str) -> "PreTrainedModel": 40 | """Load model. 41 | 42 | Args: 43 | model_id (str): Hugging Face model ID. 44 | """ 45 | pass 46 | 47 | @abstractmethod 48 | def load_tokenizer(self, tokenizer_id: str) -> "PreTrainedTokenizer": 49 | """Load tokenizer. 50 | 51 | Args: 52 | tokenizer_id (str): Hugging Face tokenizer name. 53 | """ 54 | pass 55 | 56 | def postprocess( 57 | self, model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer" 58 | ) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]: 59 | """Postprocess model and tokenizer. 60 | 61 | Args: 62 | model (PreTrainedModel): Model to postprocess. 63 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess. 64 | """ 65 | return self.postprocess_model(model), self.postprocess_tokenizer(tokenizer) 66 | 67 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel": 68 | """Postprocess model. 69 | 70 | Args: 71 | model (PreTrainedModel): Model to postprocess. 72 | """ 73 | return model 74 | 75 | def postprocess_tokenizer( 76 | self, tokenizer: "PreTrainedTokenizer" 77 | ) -> "PreTrainedTokenizer": 78 | """Postprocess tokenizer. 79 | 80 | Args: 81 | tokenizer (PreTrainedTokenizer): Tokenizer to postprocess. 82 | """ 83 | return tokenizer 84 | 85 | def get_model_init_kwargs(self) -> dict: 86 | """Load tokenizer. 87 | 88 | Args: 89 | tokenizer_id (str): Hugging Face tokenizer name. 
90 | """ 91 | return {} -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/sequenceclassification_yelp_review_full.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | 8 | 9 | class SequenceclassificationYelpReviewFull(Task): 10 | AUTO_MODEL_CLASS = AutoModelForSequenceClassification 11 | 12 | DATASET_PATH = "yelp_review_full" 13 | DATASET_NAME = "" 14 | FROM_PRETRAINED_KWARGS = { 15 | "num_labels": 5 16 | } 17 | 18 | def get_data_proprocess(self) -> Any: 19 | tokenizer = self.tokenizer 20 | 21 | # adopt python decorator TODO 22 | def preprocess_function(examples: pd.DataFrame): 23 | examples = examples.to_dict("list") 24 | ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True) 25 | 26 | # Add back the original columns 27 | ret = {**examples, **ret} 28 | return pd.DataFrame.from_dict(ret) 29 | 30 | return preprocess_function 31 | 32 | def get_compute_metrics(self) -> Any: 33 | DATASET_PATH = self.DATASET_PATH 34 | DATASET_NAME = self.DATASET_NAME 35 | 36 | def compute_metrics(eval_preds): 37 | # metric = evaluate.load(DATASET_PATH, DATASET_NAME) 38 | metric = evaluate.load("accuracy") 39 | logits, labels = eval_preds 40 | predictions = np.argmax(logits, axis=-1) 41 | return metric.compute(predictions=predictions, references=labels) 42 | 43 | return compute_metrics 44 | 45 | def get_data_collator(self) -> Any: 46 | data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) 47 | return data_collator 48 | 49 | def training_key(self): 50 | """ 51 | :return: Iterable[obj] 52 | A iterable of any object, that doc_to_text can handle 53 | """ 54 | return "train" 55 | 56 | def validation_key(self): 57 | """ 58 | :return: Iterable[obj] 59 | A iterable of any object, that doc_to_text can handle 60 | """ 61 | return "validation" 62 | 63 | def tokenize_function(self, examples): 64 | return self.tokenizer(examples["text"], padding="max_length", truncation=True, max_length=self.ft_config.train_config.base_config.max_length) 65 | 66 | def getTrainDataSet(self): 67 | return self.dataset[self.training_key()].map(self.tokenize_function, batched=True) 68 | 69 | def getEvalDataSet(self): 70 | return self.dataset[self.validation_key()].map(self.tokenize_function, batched=True) 71 | 72 | def getSmallTrainDataSet(self, len: int): 73 | return self.dataset[self.training_key()].select(range(len)).map(self.tokenize_function, batched=True) 74 | 75 | def getSmallEvalDataSet(self, len: int): 76 | return self.dataset[self.validation_key()].select(range(len)).map(self.tokenize_function, batched=True) 77 | -------------------------------------------------------------------------------- /llmadmin/frontend/mongo_logger.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timezone 3 | from typing import Any 4 | 5 | # from gradio import FlaggingCallback 6 | from pymongo import MongoClient 7 | 8 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME 9 | from llmadmin.common.llm_event import LlmEvent, LlmResponse, Vote 10 | 11 | 12 | # class MongoLogger(FlaggingCallback): 13 | # """Logs flagged events to Mongo DB.""" 14 | 15 | # def __init__(self, url, project_name) -> None: 16 | # self.url = url 17 | # 
self.client = MongoClient(url) 18 | # self.project_name = project_name 19 | # self.components = None 20 | # try: 21 | # self.client.admin.command("ping") 22 | # print("Pinged MongoDB. Correctly set up") 23 | # except Exception as e: 24 | # print(e) 25 | 26 | # def setup(self, components): 27 | # self.components = components 28 | # # Check if the database exists 29 | # if DB_NAME in self.client.list_database_names(): 30 | # self.db = self.client[DB_NAME] 31 | # print(f"Database '{DB_NAME}' already exists.") 32 | # else: 33 | # # The database doesn't exist, so create it 34 | # self.db = self.client[DB_NAME] 35 | # print(f"Database '{DB_NAME}' created.") 36 | 37 | # # OK, now we create a collection. 38 | # # Check if the collection exists 39 | # if COLLECTION_NAME in self.db.list_collection_names(): 40 | # # The collection exists 41 | # print( 42 | # f"Collection '{COLLECTION_NAME}' already exists in database '{DB_NAME}'." 43 | # ) 44 | # else: 45 | # # The collection doesn't exist, so create it 46 | # self.db.create_collection(COLLECTION_NAME) 47 | # print(f"Collection '{COLLECTION_NAME}' created in database '{DB_NAME}'.") 48 | 49 | # def flag(self, flag_data: list[Any], flag_option: str = "", username: str = ""): 50 | # print(f"last value is: {flag_data}") 51 | # event = LlmEvent( 52 | # project_name=self.project_name, 53 | # created_at=datetime.now(timezone.utc), 54 | # instance_id=str(uuid.uuid4()), 55 | # user_prompt=flag_data[0], 56 | # # TODO(mwk): Work out how to generalize this to _n_ inputs 57 | # responses=[ 58 | # LlmResponse( 59 | # model_id=flag_data[1], text=flag_data[4], gen_stats=flag_data[8][0] 60 | # ), 61 | # LlmResponse( 62 | # model_id=flag_data[2], text=flag_data[5], gen_stats=flag_data[8][1] 63 | # ), 64 | # LlmResponse( 65 | # model_id=flag_data[3], text=flag_data[6], gen_stats=flag_data[8][2] 66 | # ), 67 | # ], 68 | # session_id=flag_data[9], 69 | # ) 70 | # if flag_data[7]: 71 | # vote_number = int(flag_data[7][-1]) 72 | # event.votes = Vote(llm=flag_data[vote_number], score=1) 73 | 74 | # print(f"Event is {event.json()}") 75 | # result = self.client[DB_NAME][COLLECTION_NAME].insert_one(event.dict()) 76 | # print(f"Mongo result {result}") 77 | -------------------------------------------------------------------------------- /llmadmin/common/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | class GPT: 7 | """A simple wrapper around the OpenAI API for evaluating GPT models.""" 8 | 9 | def __init__(self, model_version="gpt-4", temperature=0.9, max_tokens=2048): 10 | api_key = os.getenv("GPT4_API_KEY") 11 | assert api_key, "Please set the GPT4_API_KEY environment variable" 12 | self.__api_key = os.getenv("GPT4_API_KEY") 13 | self.temperature = temperature 14 | self.max_tokens = max_tokens 15 | self.model = model_version 16 | 17 | def evaluate_results(self, prompt, results): 18 | """Evaluate a list of results generated by several models on a single prompt.""" 19 | for result in results: 20 | result.pop("stats", None) 21 | 22 | gpt_messages = [ 23 | { 24 | "role": "system", 25 | "content": ( 26 | """You are an assistant tasked with ranking responses in 27 | order of quality, creating a leaderboard of all models. 28 | The best model has rank 1, the second best has rank 2, etc. 
29 | You have to assess the quality of the responses, and rank them.""" 30 | ), 31 | }, 32 | { 33 | "role": "user", 34 | "content": ( 35 | f"""You are given a prompt and a list of responses 36 | from several models in Python dictionary format. 37 | Specifically, the format of the results is as follows: 38 | 39 | 'model': , 'result': 40 | 41 | Your job is to "rank" the responses in order of quality, (not by 42 | the order in which they were generated). 43 | 44 | The prompt is: {prompt} 45 | The responses are: {results} 46 | 47 | Please rank the responses by quality, and return a list of the model 48 | names and ranks, i.e produce the following output: 49 | 50 | 'model': , 'rank': 51 | 52 | Only output this format, and nothing else. Your response must 53 | be a valid Python dictionary. 54 | Think step by step and give me this quality ranking. 55 | """ 56 | ), 57 | }, 58 | ] 59 | return self.generate(gpt_messages) 60 | 61 | def generate(self, messages): 62 | data = { 63 | "model": self.model, 64 | "messages": messages, 65 | "max_tokens": self.max_tokens, 66 | "temperature": self.temperature, 67 | } 68 | headers = { 69 | "Content-Type": "application/json", 70 | "Authorization": f"Bearer {self.__api_key}", 71 | } 72 | resp = requests.post( 73 | url="https://api.openai.com/v1/chat/completions", json=data, headers=headers 74 | ) 75 | 76 | if not resp.ok: 77 | raise RuntimeError(f"Failed to generate: {resp.reason}") 78 | 79 | return resp.json()["choices"][0]["message"]["content"] 80 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/processors.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import torch 4 | from transformers import LogitsProcessor, StoppingCriteria 5 | 6 | from llmadmin.backend.logger import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | class StopOnTokens(StoppingCriteria): 12 | """ 13 | Stopping criteria to allow stopping on multi-token sequences. 14 | 15 | ``first_stopping_token_in_batch`` attribute can be used for postprocessing after 16 | generation. 17 | 18 | Args: 19 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on. 20 | """ 21 | 22 | def __init__(self, stopping_sequences: List[Union[List[int], int]]) -> None: 23 | self.stopping_sequences = stopping_sequences 24 | self.stop_ids = [ 25 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 26 | for stop_id in self.stopping_sequences 27 | ] 28 | self.first_stopping_token_in_batch = {} 29 | 30 | def __call__( 31 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs 32 | ) -> bool: 33 | for batch_index, batch in enumerate(input_ids): 34 | if batch_index not in self.first_stopping_token_in_batch: 35 | for stop_id in self.stop_ids: 36 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal( 37 | stop_id.to(batch.device) 38 | ): 39 | self.first_stopping_token_in_batch[batch_index] = len(batch) - 1 40 | break 41 | return len(self.first_stopping_token_in_batch) == len(input_ids) 42 | 43 | 44 | class StopOnTokensLogitsProcessor(LogitsProcessor): 45 | """ 46 | Processor to force only EOS token after encountering a stopping sequence. 47 | 48 | Args: 49 | stopping_sequences (List[Union[List[int], int]]): List of sequences to stop on. 50 | eos_token_id (Union[int, List[int]]): EOS token id(s). 
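    Example (minimal sketch; ``model``, ``tokenizer``, ``inputs`` and the stop
    token ids below are placeholders, not values used elsewhere in this repo):

        from transformers import LogitsProcessorList

        processor = StopOnTokensLogitsProcessor(
            stopping_sequences=[[1234, 5678]],
            eos_token_id=tokenizer.eos_token_id,
        )
        outputs = model.generate(
            **inputs,
            logits_processor=LogitsProcessorList([processor]),
        )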
51 | """ 52 | 53 | def __init__( 54 | self, 55 | stopping_sequences: List[Union[List[int], int]], 56 | eos_token_id: Union[int, List[int]], 57 | ) -> None: 58 | if isinstance(eos_token_id, int): 59 | eos_token_id = [eos_token_id] 60 | self.eos_token_id = eos_token_id 61 | self.stop_ids = [ 62 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 63 | for stop_id in stopping_sequences 64 | ] 65 | self._stopped_batches = set() 66 | self._nulled_batch = None 67 | 68 | def __call__( 69 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor 70 | ) -> torch.FloatTensor: 71 | for batch_index, batch in enumerate(input_ids): 72 | if batch_index not in self._stopped_batches: 73 | for stop_id in self.stop_ids: 74 | if len(batch) > len(stop_id) and batch[-len(stop_id) :].equal( 75 | stop_id.to(batch.device) 76 | ): 77 | self._stopped_batches.add(batch_index) 78 | break 79 | if batch_index in self._stopped_batches: 80 | if self._nulled_batch is None: 81 | scores[batch_index, :] = -float("inf") 82 | scores[batch_index, self.eos_token_id] = 0 83 | self._nulled_batch = scores[batch_index] 84 | scores[batch_index] = self._nulled_batch 85 | return scores 86 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/llamacpp.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union 3 | 4 | import torch 5 | from huggingface_hub import hf_hub_download 6 | 7 | from llmadmin.backend.logger import get_logger 8 | 9 | from ._base import LLMInitializer 10 | 11 | if TYPE_CHECKING: 12 | from llama_cpp import Llama 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class LlamaCppTokenizer: 18 | """Thin wrapper around a llama_cpp model to provide a subset of the PreTrainedTokenizer interface""" 19 | 20 | def __init__(self, model: "Llama") -> None: 21 | self.model = model 22 | 23 | def decode(self, tokens: Union[List[int], List[List[int]]], **kwargs) -> str: 24 | if not tokens: 25 | return tokens 26 | if isinstance(tokens[0], int): 27 | return self.model.detokenize(tokens).decode("utf-8") 28 | return [self.decode(t) for t in tokens] 29 | 30 | def encode(self, text: Union[str, List[str], List[List[str]]], **kwargs) -> str: 31 | if isinstance(text, str): 32 | return self.model.tokenize(text.encode("utf-8")) 33 | return [self.encode(t) for t in text] 34 | 35 | def batch_encode(self, text: Union[List[str], List[List[str]]], **kwargs) -> str: 36 | return self.encode(text) 37 | 38 | def __call__(self, text: Union[str, List[str], List[List[str]]], **kwargs): 39 | return self.encode(text, **kwargs) 40 | 41 | 42 | class LlamaCppInitializer(LLMInitializer): 43 | """Initialize llama_cpp model and tokenizer. 44 | 45 | Args: 46 | device (torch.device): Device to place model and tokenizer on. 47 | world_size (int): Number of GPUs to use. 48 | model_filename (str): Name of the model file to download from HuggingFace Hub. 49 | This needs to be in the ``model_id`` repository (passed to ``self.load()``). 50 | **model_init_kwargs: Keyword arguments to pass to the llama_cpp model init. 
51 | """ 52 | 53 | def __init__( 54 | self, 55 | device: torch.device, 56 | world_size: int, 57 | model_filename: str, 58 | **model_init_kwargs, 59 | ): 60 | super().__init__( 61 | device=device, 62 | world_size=world_size, 63 | ) 64 | self.model_filename = model_filename 65 | self.model_init_kwargs = model_init_kwargs 66 | 67 | def _get_model_init_kwargs(self) -> Dict[str, Any]: 68 | return { 69 | # We use a large integer to put all of the layers on GPU by default. 70 | "n_gpu_layers": 0 if self.device.type == "cpu" else 10**6, 71 | "seed": 0, 72 | "verbose": False, 73 | "n_threads": int(os.environ["OMP_NUM_THREADS"]), 74 | **self.model_init_kwargs, 75 | } 76 | 77 | def load_model(self, model_id: str) -> "Llama": 78 | logger.info(f"LlamaCppInitializer downloading {model_id} : {self.model_filename}") 79 | model_path = hf_hub_download(model_id, self.model_filename) 80 | logger.info(f"LlamaCppInitializer Loading model {model_path}") 81 | # Lazy import to avoid issues on CPU head node 82 | from llama_cpp import Llama 83 | 84 | return Llama( 85 | model_path=os.path.abspath(model_path), 86 | **self._get_model_init_kwargs(), 87 | ) 88 | 89 | def load_tokenizer(self, tokenizer_name: str) -> None: 90 | return None 91 | 92 | def postprocess( 93 | self, model: "Llama", tokenizer: None 94 | ) -> Tuple["Llama", LlamaCppTokenizer]: 95 | return super().postprocess(model, LlamaCppTokenizer(model)) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/_base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import abstractmethod 3 | from typing import Any 4 | from llmadmin.backend.server.models import DataConfig 5 | from datasets import load_dataset 6 | from datasets import load_metric 7 | import transformers 8 | from transformers import PreTrainedTokenizer, PreTrainedModel 9 | from typing import Any, Dict 10 | from llmadmin.backend.server.models import FTConfig 11 | from llmadmin.backend.logger import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | class Task(abc.ABC): 16 | AUTO_MODEL_CLASS: transformers.AutoModel = None 17 | 18 | # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub 19 | # or a path to a custom `datasets` loading script. 20 | DATASET_PATH: str = None 21 | 22 | # The name of a subset within `DATASET_PATH`. 
23 | DATASET_NAME: str = None 24 | 25 | # kwargs when build model with transformer's "from_pretrained" 26 | FROM_PRETRAINED_KWARGS: Dict[str, Any] = None 27 | 28 | def __init__( 29 | self, 30 | tokenizer: "PreTrainedTokenizer", 31 | ft_config: "FTConfig", 32 | ) -> None: 33 | self.tokenizer = tokenizer 34 | self.ft_config = ft_config 35 | self.download_dataset() 36 | self._pre() 37 | 38 | @classmethod 39 | def from_tokenizer( 40 | cls, 41 | tokenizer: "PreTrainedTokenizer", 42 | ft_config: "FTConfig", 43 | ) -> "Task": 44 | fac = cls( 45 | tokenizer = tokenizer, 46 | ft_config = ft_config 47 | ) 48 | 49 | return fac 50 | 51 | @abstractmethod 52 | def get_data_proprocess(self) -> Any: 53 | """Change trainning data to tensor model can accepted""" 54 | pass 55 | 56 | @abstractmethod 57 | def get_compute_metrics(self) -> Any: 58 | pass 59 | 60 | @abstractmethod 61 | def get_data_collator(self) -> Any: 62 | pass 63 | 64 | def _pre(self) -> Any: 65 | pass 66 | 67 | @abstractmethod 68 | def training_key(self): 69 | """ 70 | :return: Iterable[obj] 71 | A iterable of any object, that doc_to_text can handle 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def validation_key(self): 77 | """ 78 | :return: Iterable[obj] 79 | A iterable of any object, that doc_to_text can handle 80 | """ 81 | pass 82 | 83 | @abstractmethod 84 | def getTrainDataSet(self): 85 | pass 86 | 87 | @abstractmethod 88 | def getEvalDataSet(self): 89 | pass 90 | 91 | @abstractmethod 92 | def getSmallTrainDataSet(self, len: int): 93 | pass 94 | 95 | @abstractmethod 96 | def getSmallEvalDataSet(self, len: int): 97 | pass 98 | 99 | def get_dataset(self): 100 | return self.dataset 101 | 102 | def download_dataset(self): 103 | # Downloading and loading a dataset from the hub. 104 | logger.info("Start loading dataset") 105 | if self.ft_config.data_config.local_path: 106 | logger.info(f"Loading dataset from local path {self.ft_config.data_config.local_path}") 107 | raw_datasets = load_dataset(self.ft_config.data_config.local_path) 108 | else: 109 | if self.DATASET_NAME: 110 | logger.info(f"Downloading dataset {self.DATASET_NAME} from {self.DATASET_PATH}") 111 | raw_datasets = load_dataset(self.DATASET_PATH, self.DATASET_NAME) 112 | else: 113 | logger.info(f"Downloading dataset from {self.DATASET_PATH}") 114 | raw_datasets = load_dataset(self.DATASET_PATH) 115 | logger.info("Done load dataset") 116 | logger.info(f"{raw_datasets}") 117 | self.dataset = raw_datasets 118 | 119 | def set_model(self, model: PreTrainedModel): 120 | self.model = model 121 | 122 | def get_model(self): 123 | return self.model -------------------------------------------------------------------------------- /llm_finetune_ray.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import ray 5 | import ray.util.scheduling_strategies 6 | 7 | 8 | def force_on_node(node_id: str, remote_func_or_actor_class): 9 | scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( 10 | node_id=node_id, soft=False 11 | ) 12 | options = {"scheduling_strategy": scheduling_strategy} 13 | return remote_func_or_actor_class.options(**options) 14 | 15 | 16 | def run_on_every_node(remote_func_or_actor_class, *remote_args, **remote_kwargs): 17 | refs = [] 18 | for node in ray.nodes(): 19 | if node["Alive"] and node["Resources"].get("GPU", None): 20 | refs.append( 21 | force_on_node(node["NodeID"], remote_func_or_actor_class).remote( 22 | *remote_args, **remote_kwargs 23 | ) 24 
| ) 25 | return ray.get(refs) 26 | 27 | 28 | @ray.remote(num_gpus=1) 29 | def mount_nvme(): 30 | if os.path.exists("/nvme"): 31 | return 32 | subprocess.run( 33 | 'drive_name="${1:-/dev/nvme1n1}"; mount_path="${2:-/nvme}"; set -x; sudo file -s "$drive_name"; sudo apt install xfsprogs -y; sudo mkfs -t xfs "$drive_name"; sudo mkdir "$mount_path" && sudo mount "$drive_name" "$mount_path" && sudo chown -R ray "$mount_path"', 34 | shell=True, 35 | check=True, 36 | ) 37 | 38 | 39 | @ray.remote(num_gpus=1) 40 | def download_model(base_model_name=None): 41 | base_model_name = ( 42 | base_model_name or "RWKV-4-Pile-1B5" 43 | ) # "RWKV-4-Pile-1B5", "RWKV-4-Pile-430M", "RWKV-4-Pile-169M" 44 | base_model_url = f"https://huggingface.co/BlinkDL/{base_model_name.lower()}" 45 | subprocess.run( 46 | f"cd /nvme; git lfs clone {base_model_url}; ls '{base_model_name.lower()}'", 47 | shell=True, 48 | check=True, 49 | ) 50 | 51 | 52 | @ray.remote(num_gpus=1) 53 | def download_pile_remote(dataset_name): 54 | subprocess.run( 55 | "rm -rf /nvme/enwik8; rm -rf /nvme/data/pile/; rm -rf ~/gpt-neox", 56 | shell=True, 57 | check=True, 58 | ) 59 | subprocess.run( 60 | "cd ~/; git clone https://github.com/Yard1/gpt-neox.git;", shell=True 61 | ) 62 | subprocess.run( 63 | f"cd ~/; cd gpt-neox; echo 'starting dataset download {dataset_name}'; python prepare_data.py {dataset_name} -d /nvme/data/pile -t HFTokenizer --vocab-file '/mnt/cluster_storage/20B_tokenizer.json' && echo 'download complete'", 64 | shell=True, 65 | check=True, 66 | ) 67 | 68 | 69 | def download_pile(dataset_name): 70 | subprocess.run( 71 | # Necessary for gpt-neox tokenizer to work 72 | "pip uninstall -y deepspeed && pip install --user -U git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4a93e5d2c75#egg=deepspeed", 73 | shell=True, 74 | ) 75 | try: 76 | run_on_every_node(download_pile_remote, dataset_name=dataset_name) 77 | finally: 78 | subprocess.run( 79 | # Use latest deepspeed for actual training. 
Will crash otherwise 80 | "pip uninstall -y deepspeed && pip install -U --user deepspeed", 81 | shell=True, 82 | ) 83 | 84 | 85 | @ray.remote(num_gpus=1) 86 | def clean_cache(): 87 | subprocess.run("rm -rf ~/.cache/torch_extensions", shell=True, check=True) 88 | 89 | 90 | @ray.remote(num_gpus=1) 91 | def run(cmd: str): 92 | subprocess.run(cmd, shell=True, check=True) 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | 98 | parser.add_argument("function", type=str, help="function in this file to run") 99 | parser.add_argument("args", nargs="*", type=str, help="string args to function") 100 | args = parser.parse_args() 101 | 102 | ray.init() 103 | if args.function not in globals(): 104 | raise ValueError(f"{args.function} doesn't exist") 105 | fn = globals()[args.function] 106 | assert callable(fn) or hasattr(fn, "_function") 107 | print(f"Running {args.function}({', '.join(args.args)})") 108 | if hasattr(fn, "_function"): 109 | run_on_every_node(fn, *args.args) 110 | else: 111 | fn(*args.args) 112 | -------------------------------------------------------------------------------- /llmadmin/backend/server/run.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | from typing import Dict, List, Union 3 | import ray 4 | from llmadmin.backend.server.app import ApiServer 5 | from llmadmin.backend.server.config import SERVE_RUN_HOST 6 | from llmadmin.backend.server.models import FTApp 7 | from llmadmin.backend.server.utils import parse_args, parse_args_ft 8 | # import uuid 9 | # import os 10 | from llmadmin.backend.llm.ft import TransformersFT 11 | from llmadmin.backend.llm.ft import RayTrain 12 | from llmadmin.backend.logger import get_logger 13 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 14 | from llmadmin.backend.server.utils import get_serve_port 15 | from ray import serve 16 | 17 | # ray.init(address="auto") 18 | logger = get_logger(__name__) 19 | 20 | def run_ray_ft(ft: Union[FTApp, str]): 21 | """Run LLM fine-tuning with Ray Train on the local Ray cluster. 22 | 23 | Args: 24 | ft: An FTApp object or a path to a YAML file defining one. 25 | 26 | Example: 27 | run_ray_ft("models/model.yaml") # run from a fine-tune YAML definition 28 | run_ray_ft(ft_app) # run from an FTApp object 29 | """ 30 | 31 | ft = parse_args_ft(ft) 32 | if not ft: 33 | raise RuntimeError("No valid fine-tune definition was found.") 34 | 35 | if isinstance(ft, FTApp): 36 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}") 37 | else: 38 | raise RuntimeError("The parsed definition is not a valid FTApp.") 39 | 40 | # ray._private.usage.usage_lib.record_library_usage("llmadmin") 41 | 42 | runner = RayTrain(ft) 43 | runner.train() 44 | 45 | def run_ft(ft: Union[FTApp, str]): 46 | """Run LLM fine-tuning with Transformers on the local Ray cluster. 47 | 48 | Args: 49 | ft: An FTApp object or a path to a YAML file defining one. 50 | 51 | Example: 52 | run_ft("models/model.yaml") # run from a fine-tune YAML definition 53 | run_ft(ft_app) # run from an FTApp object 54 | """ 55 | 56 | ft = parse_args_ft(ft) 57 | if not ft: 58 | raise RuntimeError("No valid fine-tune definition was found.") 59 | 60 | if isinstance(ft, FTApp): 61 | logger.info(f"Initialized a Finetune instance of FTApp {ft.json(indent=2)}") 62 | else: 63 | raise RuntimeError("The parsed definition is not a valid FTApp.") 64 | 65 | ray._private.usage.usage_lib.record_library_usage("llmadmin") 66 | 67 | runner = TransformersFT(ft) 68 | runner.train() 69 | 70 | def start_apiserver(port: int = DEFAULT_HTTP_PORT,
resource_config: str = None, scale_config: str = None): 71 | """Run the API Server on the local Ray Cluster 72 | 73 | Args: 74 | *host: The host ip to run. 75 | *port: The port to run. 76 | 77 | """ 78 | scale_dict = dict() 79 | try: 80 | scale_dict = toDict(scale_config) 81 | except: 82 | raise ValueError(f"Invalid value of scale config '{scale_config}'") 83 | resource_dict = None 84 | try: 85 | resource_dict = toDict(resource_config) 86 | except: 87 | raise ValueError(f"Invalid value of resource config '{resource_config}'") 88 | 89 | # ray._private.usage.usage_lib.record_library_usage("llmfinetune") 90 | # ray.init(address="auto") 91 | serve_start_port = get_serve_start_port(port) 92 | app = ApiServer.options(autoscaling_config=scale_dict, ray_actor_options=resource_dict).bind() 93 | serve.start(http_options={"host": SERVE_RUN_HOST, "port": serve_start_port}) 94 | logger.info(f"Serve 'apiserver' is running at {SERVE_RUN_HOST}/{serve_start_port}") 95 | logger.info(f"Serve 'apiserver' run with resource: {resource_dict} , scale: {scale_dict}") 96 | serve.run(app, name="apiserver", route_prefix="/api") 97 | 98 | # parse k1=v1,k2=v2 to dict 99 | def toDict(kv: str) -> Dict: 100 | if kv: 101 | s = kv.replace(' ', ', ') 102 | return eval(f"dict({s})") 103 | else: 104 | return dict() 105 | 106 | def get_serve_start_port(port: int): 107 | serve_start_port = port 108 | serve_runtime_port = get_serve_port() 109 | if serve_runtime_port > -1: 110 | logger.info( 111 | f"Serve is already running at {SERVE_RUN_HOST}:{serve_runtime_port}") 112 | serve_start_port = serve_runtime_port 113 | return serve_start_port 114 | 115 | # if __name__ == "__main__": 116 | # run_ft(*sys.argv[1:]) 117 | 118 | 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The build output should clearly not be checked in 2 | .llm-ray/ 3 | *test-output.xml 4 | /bazel-* 5 | /python/ray/core 6 | /python/ray/pickle5_files/ 7 | /python/ray/thirdparty_files/ 8 | /python/ray/pyarrow_files/ 9 | /python/ray/jars/ 10 | /python/ray/cpp/ 11 | /python/build 12 | /python/dist 13 | /python/python-driver-* 14 | /python/ray/serve/generated 15 | /thirdparty/pkg/ 16 | /build/java 17 | .jar 18 | /dashboard/client/build 19 | finetune_models 20 | 21 | # Files generated by flatc should be ignored 22 | /src/ray/gcs/format/*_generated.h 23 | /src/ray/object_manager/format/*_generated.h 24 | /src/ray/raylet/format/*_generated.h 25 | /java/runtime/src/main/java/io/ray/runtime/generated/* 26 | /java/serve/src/main/java/io/ray/serve/generated/* 27 | 28 | # Files genrated by c++ worker should be ignored. 
29 | /cpp/example/thirdparty/ 30 | /cpp/example/bazel-* 31 | /python/ray/cpp 32 | 33 | # Redis temporary files 34 | *dump.rdb 35 | 36 | # Python byte code files 37 | *.pyc 38 | python/.eggs 39 | 40 | # Backup files 41 | *.bak 42 | 43 | # Emacs temporary files 44 | *~ 45 | *# 46 | 47 | # Compiled Object files 48 | *.slo 49 | *.lo 50 | *.o 51 | *.xo 52 | *.obj 53 | 54 | # Precompiled Headers 55 | *.gch 56 | *.pch 57 | 58 | # Compiled Dynamic libraries 59 | *.so 60 | *.dylib 61 | *.dll 62 | python/ray/_raylet.pyd 63 | 64 | # Incremental linking files 65 | *.ilk 66 | 67 | # Library export files 68 | *.exp 69 | 70 | # Debug symbols 71 | *.pdb 72 | 73 | # Fortran module files 74 | *.mod 75 | !deploy/ray-operator/go.mod 76 | 77 | # Compiled Static libraries 78 | *.lai 79 | *.la 80 | *.a 81 | *.lib 82 | 83 | # Executables 84 | *.exe 85 | *.out 86 | *.app 87 | 88 | # Visual Studio files 89 | /packages 90 | *.suo 91 | *.user 92 | *.VC.db 93 | *.VC.opendb 94 | 95 | # Protobuf-generated files 96 | *_pb2.py 97 | *.pb.h 98 | *.pb.cc 99 | 100 | # Ray cluster configuration 101 | scripts/nodes.txt 102 | 103 | # OS X folder attributes 104 | .DS_Store 105 | 106 | # Debug files 107 | *.dSYM/ 108 | *.su 109 | 110 | # Python setup files 111 | *.egg-info 112 | 113 | # Compressed files 114 | *.gz 115 | 116 | # Datasets from examples 117 | **/MNIST_data/ 118 | **/cifar-10-batches-bin/ 119 | 120 | # Generated documentation files 121 | /doc/_build 122 | /doc/source/_static/thumbs 123 | /doc/source/tune/generated_guides/ 124 | /doc/source/**/doc/ 125 | 126 | # User-specific stuff: 127 | .idea/**/workspace.xml 128 | .idea/**/tasks.xml 129 | .idea/dictionaries 130 | .llvm-local.bazelrc 131 | 132 | # Sensitive or high-churn files: 133 | .idea/**/dataSources/ 134 | .idea/**/dataSources.ids 135 | .idea/**/dataSources.xml 136 | .idea/**/dataSources.local.xml 137 | .idea/**/sqlDataSources.xml 138 | .idea/**/dynamic.xml 139 | .idea/**/uiDesigner.xml 140 | 141 | # Gradle: 142 | .idea/**/gradle.xml 143 | .idea/**/libraries 144 | .idea 145 | 146 | # Website 147 | /site/Gemfile.lock 148 | /site/.sass-cache 149 | /site/_site 150 | 151 | # Pytest Cache 152 | **/.pytest_cache 153 | **/.cache 154 | .benchmarks 155 | python-driver-* 156 | 157 | # Vscode 158 | .vscode/ 159 | 160 | *.iml 161 | 162 | # Java 163 | java/**/target 164 | java/**/lib 165 | java/**/.settings 166 | java/**/.classpath 167 | java/**/.project 168 | java/runtime/native_dependencies/ 169 | java/testng_custom.xml 170 | 171 | dependency-reduced-pom.xml 172 | 173 | # Cpp 174 | cpp/example/thirdparty/ 175 | 176 | .clwb 177 | 178 | # pom.xml files generated from pom_template.xml 179 | java/**/pom.xml 180 | 181 | # python virtual env 182 | venv 183 | 184 | # pyenv version file 185 | .python-version 186 | 187 | # Vim 188 | .*.swp 189 | *.swp 190 | .*.swo 191 | *.swo 192 | tags 193 | tags.lock 194 | tags.temp 195 | *.vim 196 | 197 | # Emacs 198 | .#* 199 | 200 | # tools 201 | tools/prometheus* 202 | 203 | # ray project files 204 | project-id 205 | .mypy_cache/ 206 | 207 | # release test related 208 | .anyscale.yaml 209 | test_state.json 210 | 211 | # workflow storage 212 | workflow_data/ 213 | 214 | # vscode java extention generated 215 | .factorypath 216 | 217 | # Jupyter Notebooks 218 | **/.ipynb_checkpoints/ 219 | 220 | /external 221 | # Compiled output -> don't check in 222 | /compile_commands.json 223 | # Directory where clangd puts its indexing work 224 | /.cache/ 225 | 226 | # Auto-generated tag mapping 227 | tag-mapping.json 228 | 229 | .bazeliskrc 230 | 231 | # 
ignore tmp files 232 | *.tmp 233 | deploy/anyscale/service.yaml 234 | out 235 | 236 | # build output 237 | build/ 238 | dist/ 239 | 240 | results/ 241 | aviary-output.json 242 | evaluation-output.json 243 | prompts.txt 244 | hash 245 | __pycache__ 246 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Tuple 2 | 3 | import torch 4 | from transformers import PreTrainedTokenizer 5 | 6 | from llmadmin.backend.server.models import Prompt 7 | 8 | 9 | def tokenize_string(tokenizer: PreTrainedTokenizer, key: str) -> Union[int, List[int]]: 10 | """Tokenize a string using a tokenizer. 11 | 12 | Args: 13 | tokenizer (PreTrainedTokenizer): Tokenizer to use. 14 | key (str): String to tokenize. 15 | """ 16 | token_ids = tokenizer.encode(key, add_special_tokens=False) 17 | return token_ids[0] if len(token_ids) == 1 else token_ids 18 | 19 | 20 | def decode_tokens(tokenizer: PreTrainedTokenizer, tokens: Union[int, List[int]]) -> str: 21 | tokens = tokens if isinstance(tokens, list) else [tokens] 22 | text = tokenizer.decode(tokens) 23 | return text 24 | 25 | 26 | def truncate_to_first_stop_token( 27 | tokens: torch.LongTensor, 28 | stop_ids: List[Union[int, List[int]]], 29 | ) -> torch.LongTensor: 30 | """Truncate tokens up to the first stop_id. 31 | 32 | Args: 33 | tokens (torch.LongTensor): Tokens to truncate. 34 | stop_ids (List[Union[int, List[int]]]): Stop ids to truncate at. Can be 35 | composed of single stop ids or sequences of ids. 36 | """ 37 | if not stop_ids: 38 | return tokens 39 | stop_ids: List[torch.LongTensor] = [ 40 | torch.LongTensor([stop_id] if not isinstance(stop_id, list) else stop_id) 41 | for stop_id in stop_ids 42 | ] 43 | for i in range(len(tokens)): 44 | for stop_id_index, _ in enumerate(stop_ids): 45 | stop_id = stop_ids[stop_id_index].to(tokens.device) 46 | if len(tokens) - i >= len(stop_id) and tokens[i : len(stop_id) + i].equal( 47 | stop_id 48 | ): 49 | return tokens[:i] 50 | return tokens 51 | 52 | 53 | 54 | def _construct_prompt(prompt: Union[str, Prompt], prompt_format: str) -> str: 55 | if isinstance(prompt, Prompt): 56 | if prompt.use_prompt_format and prompt_format: 57 | return prompt_format.format(instruction=prompt.prompt) 58 | else: 59 | return prompt.prompt 60 | return prompt_format.format(instruction=prompt) if prompt_format else prompt 61 | 62 | def construct_prompts( 63 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]], 64 | prompt_format: str, 65 | ) -> List[str]: 66 | """Construct prompts from a prompt string or list of prompts.""" 67 | if not isinstance(prompts, list): 68 | prompts = [prompts] 69 | return [_construct_prompt(prompt, prompt_format) for prompt in prompts] 70 | 71 | def construct_prompts_experimental( 72 | prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]], 73 | prompt_format: str, 74 | ) -> List[str]: 75 | """Construct prompts from a prompt string or list of prompts.""" 76 | if not isinstance(prompts, list): 77 | prompts = [prompts] 78 | 79 | params = [] 80 | for prompt in prompts: 81 | if isinstance(prompt, Prompt) and isinstance(prompt.prompt, Tuple): 82 | params += [_construct_prompt(prompt, prompt_format) for prompt in prompt.prompt] 83 | else: 84 | params.append(_construct_prompt(prompt, prompt_format)) 85 | return params 86 | 87 | 88 | def tokenize_stopping_sequences_where_needed( 89 | tokenizer: PreTrainedTokenizer, 90 
| stopping_sequences: List[Union[str, int, List[int]]], 91 | ) -> List[Union[List[int], int]]: 92 | """If any sequence is a string, tokenize it. 93 | 94 | Args: 95 | tokenizer (PreTrainedTokenizer): Tokenizer to use. 96 | stopping_sequences (List[Union[str, int, List[int]]]): Stopping sequences to 97 | tokenize. Can be ids, sequences of ids or strings. 98 | """ 99 | if not stopping_sequences: 100 | return None 101 | return [ 102 | tokenize_string(tokenizer, sequence) if isinstance(sequence, str) else sequence 103 | for sequence in stopping_sequences 104 | ] 105 | 106 | 107 | def decode_stopping_sequences_where_needed( 108 | tokenizer: PreTrainedTokenizer, 109 | stopping_sequences: List[Union[str, int, List[int]]], 110 | ) -> List[str]: 111 | """If any sequence is a string, tokenize it.""" 112 | if not stopping_sequences: 113 | return None 114 | return [ 115 | decode_tokens(tokenizer, sequence) 116 | if not isinstance(sequence, str) 117 | else sequence 118 | for sequence in stopping_sequences 119 | ] 120 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/maskedlm_imdb.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForMaskedLM 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | 9 | class MaskedLMImdb(Task): 10 | AUTO_MODEL_CLASS = AutoModelForMaskedLM 11 | 12 | DATASET_PATH = "imdb" 13 | 14 | def get_data_proprocess(self) -> Any: 15 | tokenizer = self.tokenizer 16 | 17 | def group_texts(examples): 18 | # Concatenate all texts 19 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 20 | # Compute length of concatenated texts 21 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 22 | # We drop the last chunk if it's smaller than chunk_size 23 | total_length = (total_length // chunk_size) * chunk_size 24 | # Split by chunks of max_len 25 | result = { 26 | k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 27 | for k, t in concatenated_examples.items() 28 | } 29 | # Create a new labels column 30 | result["labels"] = result["input_ids"].copy() 31 | return result 32 | 33 | 34 | chunk_size = 128 35 | # adopt python decorator TODO 36 | def preprocess_function(examples: pd.DataFrame): 37 | # examples = examples.to_dict("list") 38 | result = tokenizer(examples["text"]) 39 | if tokenizer.is_fast: 40 | result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))] 41 | 42 | tokenized_inputs = group_texts(result) 43 | 44 | # Add back the original columns 45 | ret = {**tokenized_inputs} 46 | return pd.DataFrame.from_dict(ret) 47 | 48 | return preprocess_function 49 | 50 | def get_data_collator(self) -> Any: 51 | import collections 52 | import numpy as np 53 | from transformers import default_data_collator 54 | 55 | wwm_probability = 0.2 56 | tokenizer = self.tokenizer 57 | def whole_word_masking_data_collator(features): 58 | for feature in features: 59 | word_ids = feature.pop("word_ids") 60 | 61 | # Create a map between words and corresponding token indices 62 | mapping = collections.defaultdict(list) 63 | current_word_index = -1 64 | current_word = None 65 | for idx, word_id in enumerate(word_ids): 66 | if word_id is not None: 67 | if word_id != current_word: 68 | current_word = word_id 69 | current_word_index += 1 70 | mapping[current_word_index].append(idx) 71 | 72 | # Randomly mask words 73 | mask = np.random.binomial(1, 
wwm_probability, (len(mapping),)) 74 | input_ids = feature["input_ids"] 75 | labels = feature["labels"] 76 | new_labels = [-100] * len(labels) 77 | for word_id in np.where(mask)[0]: 78 | word_id = word_id.item() 79 | for idx in mapping[word_id]: 80 | new_labels[idx] = labels[idx] 81 | input_ids[idx] = tokenizer.mask_token_id 82 | feature["labels"] = new_labels 83 | 84 | return default_data_collator(features) 85 | 86 | return whole_word_masking_data_collator 87 | 88 | def get_compute_metrics(self) -> Any: 89 | return None 90 | 91 | def training_key(self): 92 | """ 93 | :return: Iterable[obj] 94 | A iterable of any object, that doc_to_text can handle 95 | """ 96 | return "train" 97 | 98 | def validation_key(self): 99 | """ 100 | :return: Iterable[obj] 101 | A iterable of any object, that doc_to_text can handle 102 | """ 103 | return "test" 104 | 105 | def getTrainDataSet(self): 106 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 107 | 108 | def getEvalDataSet(self): 109 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 110 | 111 | def getSmallTrainDataSet(self, len: int): 112 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 113 | 114 | def getSmallEvalDataSet(self, len: int): 115 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) -------------------------------------------------------------------------------- /llmadmin/frontend/leaderboard.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pymongo import DESCENDING, MongoClient 3 | 4 | from llmadmin.common.constants import COLLECTION_NAME, DB_NAME, G5_COST_PER_S_IN_DOLLARS 5 | 6 | 7 | class Leaderboard: 8 | def __init__(self, url: str, project_name: str): 9 | self.url = url 10 | self.client = MongoClient(url) 11 | self.db = self.client[DB_NAME] 12 | self.coll = self.db[COLLECTION_NAME] 13 | self.project_name = project_name 14 | 15 | def generate_votes_leaderboard(self) -> pd.DataFrame: 16 | pipeline_votes = [ 17 | {"$match": {"votes": {"$ne": None}}}, 18 | { 19 | "$group": { 20 | "_id": {"llm": "$votes.llm"}, 21 | "Votes": {"$sum": "$votes.score"}, 22 | } 23 | }, 24 | {"$sort": {"count": DESCENDING}}, 25 | { 26 | "$project": { 27 | "LLM": "$_id.llm", 28 | "_id": 0, 29 | "Votes": 1, 30 | } 31 | }, 32 | ] 33 | 34 | pipeline_contentions = [ 35 | {"$match": {"votes": {"$ne": None}}}, 36 | {"$unwind": {"path": "$responses"}}, 37 | { 38 | "$group": { 39 | "_id": {"llm": "$responses.model_id"}, 40 | "In Contention": {"$sum": 1.0}, 41 | } 42 | }, 43 | { 44 | "$project": { 45 | "LLM": "$_id.llm", 46 | "_id": 0, 47 | "In Contention": 1, 48 | } 49 | }, 50 | ] 51 | 52 | df_contentions = pd.DataFrame( 53 | list(self.coll.aggregate(pipeline_contentions)), 54 | columns=["LLM", "In Contention"], 55 | ) 56 | df_votes = pd.DataFrame( 57 | list(self.coll.aggregate(pipeline_votes)), columns=["LLM", "Votes"] 58 | ) 59 | df = pd.merge(df_votes, df_contentions, on="LLM", how="right").fillna(0) 60 | # Use m-estimate correction with prior of 1/3 61 | df["Win Ratio"] = (df["Votes"] + 1) / (df["In Contention"] + 3) * 3 * 1000 62 | df["Win Ratio"] = df["Win Ratio"].astype(int) 63 | df = df.sort_values(by="Win Ratio", ascending=False) 64 | return df 65 | 66 | def generate_perf_leaderboard(self) -> pd.DataFrame: 67 | pipeline = [ 68 | {"$match": {"votes": {"$ne": None}}}, 69 | {"$unwind": {"path": 
"$responses"}}, 70 | {"$match": {"responses": {"$ne": None}}}, 71 | { 72 | "$group": { 73 | "_id": {"llm": "$responses.model_id"}, 74 | "avg_latency": {"$avg": "$responses.gen_stats.total_time"}, 75 | "avg_length": {"$avg": "$responses.gen_stats.num_total_tokens"}, 76 | } 77 | }, 78 | { 79 | "$project": { 80 | "LLM": "$_id.llm", 81 | "_id": 0, 82 | "Lat (s)": "$avg_latency", 83 | "Tokens (i/o)": "$avg_length", 84 | } 85 | }, 86 | ] 87 | 88 | df = pd.DataFrame( 89 | list(self.coll.aggregate(pipeline)), 90 | columns=["LLM", "Lat (s)", "Tokens (i/o)"], 91 | ) 92 | print(f"Raw DF \n{df}") 93 | df["Tokens/s"] = df["Tokens (i/o)"] / df["Lat (s)"] 94 | df["Cost per answer"] = df["Lat (s)"] * G5_COST_PER_S_IN_DOLLARS 95 | df["CP 1k tokens $"] = 1000 / df["Tokens/s"] * G5_COST_PER_S_IN_DOLLARS 96 | df = df.sort_values(by="Tokens/s", ascending=False) 97 | df = df.round( 98 | { 99 | "Lat (s)": 1, 100 | "Tokens (i/o)": 1, 101 | "Tokens/s": 1, 102 | "Cost per answer": 4, 103 | "CP 1k tokens $": 4, 104 | } 105 | ) 106 | print(df) 107 | return df 108 | 109 | 110 | class DummyLeaderboard(Leaderboard): 111 | def __init__(self, url: str = None, project_name: str = None): 112 | pass 113 | 114 | def generate_votes_leaderboard(self) -> pd.DataFrame: 115 | return pd.DataFrame( 116 | columns=["LLM", "In Contention", "Win Ratio"], 117 | ) 118 | 119 | def generate_perf_leaderboard(self) -> pd.DataFrame: 120 | return pd.DataFrame( 121 | columns=[ 122 | "LLM", 123 | "Lat (s)", 124 | "Tokens (i/o)", 125 | "Tokens/s", 126 | "Cost per answer", 127 | "CP 1k tokens $", 128 | ] 129 | ) 130 | -------------------------------------------------------------------------------- /llmadmin/api/sdk.py: -------------------------------------------------------------------------------- 1 | # from typing import Any, Dict, List 2 | from llmadmin.api.env import assert_has_backend 3 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 4 | from llmadmin.backend.server import run 5 | 6 | 7 | # __all__ = ["models", "metadata", "run"] 8 | 9 | def start_apiserver(port: int = DEFAULT_HTTP_PORT, resource_config: str = None, scale_config: str = None) -> None: 10 | """Run Api server on the local ray cluster 11 | 12 | NOTE: This only works if you are running this command 13 | on the Ray or Anyscale cluster directly. It does not 14 | work from a general machine which only has the url and token 15 | for a model. 16 | """ 17 | assert_has_backend() 18 | run.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config) 19 | 20 | def run_ft(ft: str) -> None: 21 | """Run LLMAdmin on the local ray cluster 22 | 23 | NOTE: This only works if you are running this command 24 | on the Ray or Anyscale cluster directly. It does not 25 | work from a general machine which only has the url and token 26 | for a model. 27 | """ 28 | assert_has_backend() 29 | run.run_ft(ft) 30 | 31 | def run_ray_ft(ft: str) -> None: 32 | """Run LLMAdmin on the local ray cluster 33 | 34 | NOTE: This only works if you are running this command 35 | on the Ray or Anyscale cluster directly. It does not 36 | work from a general machine which only has the url and token 37 | for a model. 
38 | """ 39 | assert_has_backend() 40 | run.run_ray_ft(ft) 41 | 42 | # def models() -> List[str]: 43 | # """List available models""" 44 | # from llmadmin.common.backend import get_llmadmin_backend 45 | 46 | # backend = get_llmadmin_backend() 47 | # return backend.models() 48 | 49 | # def _is_llmadmin_model(model: str) -> bool: 50 | # """ 51 | # Determine if this is an llmadmin model. LLMAdmin 52 | # models do not have a '://' in them. 53 | # """ 54 | # return "://" not in model 55 | 56 | # def _supports_batching(model: str) -> bool: 57 | # provider, _ = model.split("://", 1) 58 | # return provider != "openai" 59 | 60 | # def _convert_to_llmadmin_format(model: str, llm_result): 61 | # generation = llm_result.generations 62 | # result_list = [{"generated_text": x.text} for x in generation[0]] 63 | # return result_list 64 | 65 | # def metadata(model_id: str) -> Dict[str, Dict[str, Any]]: 66 | # """Get model metadata""" 67 | # from llmadmin.common.backend import get_llmadmin_backend 68 | 69 | # backend = get_llmadmin_backend() 70 | # return backend.metadata(model_id) 71 | 72 | # def run(*model: str) -> None: 73 | # """Run LLMAdmin on the local ray cluster 74 | 75 | # NOTE: This only works if you are running this command 76 | # on the Ray or Anyscale cluster directly. It does not 77 | # work from a general machine which only has the url and token 78 | # for a model. 79 | # """ 80 | # assert_has_backend() 81 | # from llmadmin.backend.server.run import run 82 | # run(*model) 83 | 84 | # def run_experimental(*model: str) -> None: 85 | # """Run LLMAdmin on the local ray cluster 86 | 87 | # NOTE: This only works if you are running this command 88 | # on the Ray or Anyscale cluster directly. It does not 89 | # work from a general machine which only has the url and token 90 | # for a model. 91 | # """ 92 | # assert_has_backend() 93 | # from llmadmin.backend.server.run import run_experimental 94 | 95 | # run_experimental(*model) 96 | 97 | # def del_experimental(app_name: str) -> None: 98 | # """Delete ray serve on the local ray cluster 99 | 100 | # NOTE: This only works if you are running this command 101 | # on the Ray or Anyscale cluster directly. It does not 102 | # work from a general machine which only has the url and token 103 | # for a model. 104 | # """ 105 | # assert_has_backend() 106 | # from llmadmin.backend.server.run import del_experimental 107 | 108 | # del_experimental(app_name) 109 | 110 | # def run_application(flow: dict) -> None: 111 | # """Run LLMAdmin on the local ray cluster 112 | 113 | # NOTE: This only works if you are running this command 114 | # on the Ray or Anyscale cluster directly. It does not 115 | # work from a general machine which only has the url and token 116 | # for a model. 117 | # """ 118 | # assert_has_backend() 119 | # from llmadmin.backend.server.run import run_application 120 | 121 | # run_application(flow) 122 | 123 | 124 | # def run_comparation() -> None: 125 | # """Run LLMAdmin on the local ray cluster 126 | 127 | # NOTE: This only works if you are running this command 128 | # on the Ray or Anyscale cluster directly. It does not 129 | # work from a general machine which only has the url and token 130 | # for a model. 
131 | # """ 132 | # assert_has_backend() 133 | # from llmadmin.backend.server.run import run_comparation 134 | 135 | # run_comparation() 136 | 137 | 138 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/tokenclassification_conll2003.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModelForTokenClassification 3 | from typing import Any 4 | import pandas as pd 5 | import evaluate 6 | import numpy as np 7 | from transformers import DataCollatorForTokenClassification 8 | 9 | class TokenclassificationConll2003(Task): 10 | AUTO_MODEL_CLASS = AutoModelForTokenClassification 11 | 12 | DATASET_PATH = "conll2003" 13 | FROM_PRETRAINED_KWARGS = { 14 | "num_labels": 9 15 | } 16 | 17 | def _pre(self) -> Any: 18 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names 19 | id2label = {i: label for i, label in enumerate(label_names)} 20 | label2id = {v: k for k, v in id2label.items()} 21 | self.FROM_PRETRAINED_KWARGS["id2label"] = id2label 22 | self.FROM_PRETRAINED_KWARGS["label2id"] = label2id 23 | 24 | def get_data_proprocess(self) -> Any: 25 | tokenizer = self.tokenizer 26 | def align_labels_with_tokens(labels, word_ids): 27 | new_labels = [] 28 | current_word = None 29 | for word_id in word_ids: 30 | if word_id != current_word: 31 | # Start of a new word! 32 | current_word = word_id 33 | label = -100 if word_id is None else labels[word_id] 34 | new_labels.append(label) 35 | elif word_id is None: 36 | # Special token 37 | new_labels.append(-100) 38 | else: 39 | # Same word as previous token 40 | label = labels[word_id] 41 | # If the label is B-XXX we change it to I-XXX 42 | if label % 2 == 1: 43 | label += 1 44 | new_labels.append(label) 45 | 46 | return new_labels 47 | 48 | # adopt python decorator TODO 49 | def preprocess_function(examples: pd.DataFrame): 50 | # examples = examples.to_dict("list") 51 | # inputs = [i.tolist() for i in examples["tokens"]] 52 | inputs = [i for i in examples["tokens"]] 53 | tokenized_inputs = tokenizer( 54 | inputs, truncation=True, is_split_into_words=True 55 | ) 56 | all_labels = examples["ner_tags"] 57 | new_labels = [] 58 | for i, labels in enumerate(all_labels): 59 | word_ids = tokenized_inputs.word_ids(i) 60 | new_labels.append(align_labels_with_tokens(labels, word_ids)) 61 | 62 | tokenized_inputs["labels"] = new_labels 63 | 64 | # Add back the original columns 65 | ret = {**examples, **tokenized_inputs} 66 | return pd.DataFrame.from_dict(ret) 67 | 68 | return preprocess_function 69 | 70 | def get_data_collator(self) -> Any: 71 | data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer) 72 | return data_collator 73 | 74 | def get_compute_metrics(self) -> Any: 75 | label_names = self.get_dataset()[self.training_key()].features["ner_tags"].feature.names 76 | metric = evaluate.load("seqeval") 77 | 78 | def compute_metrics(eval_preds): 79 | logits, labels = eval_preds 80 | predictions = np.argmax(logits, axis=-1) 81 | 82 | # Remove ignored index (special tokens) and convert to labels 83 | true_labels = [[label_names[l] for l in label if l != -100] for label in labels] 84 | true_predictions = [ 85 | [label_names[p] for (p, l) in zip(prediction, label) if l != -100] 86 | for prediction, label in zip(predictions, labels) 87 | ] 88 | all_metrics = metric.compute(predictions=true_predictions, references=true_labels) 89 | return { 90 | "precision": 
all_metrics["overall_precision"], 91 | "recall": all_metrics["overall_recall"], 92 | "f1": all_metrics["overall_f1"], 93 | "accuracy": all_metrics["overall_accuracy"], 94 | } 95 | 96 | return compute_metrics 97 | 98 | def training_key(self): 99 | """ 100 | :return: Iterable[obj] 101 | A iterable of any object, that doc_to_text can handle 102 | """ 103 | return "train" 104 | 105 | def validation_key(self): 106 | """ 107 | :return: Iterable[obj] 108 | A iterable of any object, that doc_to_text can handle 109 | """ 110 | return "validation" 111 | 112 | def getTrainDataSet(self): 113 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 114 | 115 | def getEvalDataSet(self): 116 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 117 | 118 | def getSmallTrainDataSet(self, len: int): 119 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 120 | 121 | def getSmallEvalDataSet(self, len: int): 122 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 123 | -------------------------------------------------------------------------------- /llmadmin/backend/server/config.py: -------------------------------------------------------------------------------- 1 | from llmadmin.backend.server.models import LLMApp 2 | 3 | 4 | LLMTEMPLATE_DEPLOYMENT_CONFIG = { 5 | "autoscaling_config":{ 6 | "min_replicas": 0, 7 | "initial_replicas": 1, 8 | "max_replicas": 8, 9 | "target_num_ongoing_requests_per_replica": 1.0, 10 | "metrics_interval_s": 10.0, 11 | "look_back_period_s": 30.0, 12 | "smoothing_factor": 1.0, 13 | "downscale_delay_s": 300.0, 14 | "upscale_delay_s": 90.0, 15 | }, 16 | "ray_actor_options": { 17 | "num_cpus": 0.1 18 | } 19 | } 20 | LLMTEMPLATE_MODEL_CONFIG_COMPARATION = { 21 | "warmup": True, 22 | "model_task": "text-generation", 23 | "model_id": "template", 24 | "max_input_words": 800, 25 | "initialization": { 26 | "runtime_env": { 27 | "pip": ["deepspeed==0.9.2","accelerate"] 28 | }, 29 | "initializer":{ 30 | "type": "SingleDevice", 31 | "dtype": "float32", 32 | "from_pretrained_kwargs":{ 33 | "use_cache": True , 34 | "trust_remote_code": True 35 | } 36 | 37 | }, 38 | "pipeline": "default" 39 | }, 40 | "generation":{ 41 | "max_batch_size": 18, 42 | "generate_kwargs":{ 43 | "do_sample": True, 44 | "max_new_tokens": 128, 45 | "min_new_tokens": 16, 46 | "temperature": 0.7, 47 | "repetition_penalty": 1.1, 48 | "top_p": 0.8, 49 | "top_k": 50, 50 | }, 51 | "prompt_format": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n", 52 | "stopping_sequences": ["### Response:", "### End"] 53 | } 54 | } 55 | 56 | # TODO defaulttransformers leverage transformer pipeline to load the model, it's a problem, since some model cannot load by pipeline 57 | LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL = { 58 | "warmup": True, 59 | "model_task": "text-generation", 60 | "model_id": "template", 61 | "max_input_words": 800, 62 | "initialization": { 63 | "runtime_env": { 64 | "pip": ["deepspeed==0.9.2","accelerate"] 65 | }, 66 | "initializer":{ 67 | "type": "TransformersPipeline", 68 | "dtype": "float32", 69 | "use_fast": False, 70 | "from_pretrained_kwargs":{ 71 | "use_cache": True , 72 | "trust_remote_code": True 73 | } 74 | 75 | }, 76 | "pipeline": "defaulttransformers" 77 | }, 78 | "generation":{ 79 | "max_batch_size": 18, 80 | "generate_kwargs":{ 81 | "do_sample": True, 82 | "max_new_tokens": 128, 83 | "min_new_tokens": 16, 84 | "temperature": 0.7, 85 | "repetition_penalty": 1.1, 86 | "top_p": 0.8, 87 | "top_k": 50, 88 | }, 89 | "prompt_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n", 90 | "stopping_sequences": ["### Response:", "### End"] 91 | } 92 | } 93 | 94 | 95 | LLMTEMPLATE_SCALE_CONFIG = { 96 | "num_workers":1, 97 | "num_gpus_per_worker":0.0, 98 | "num_cpus_per_worker":1.0, 99 | "placement_strategy":'PACK', 100 | "resources_per_worker":None, 101 | "pg_timeout_s":600 102 | } 103 | EXPERIMENTAL_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_EXPERIMENTAL.copy()) 104 | EXPERIMENTAL_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy() 105 | 106 | COMPARATION_LLMTEMPLATE = LLMApp(scaling_config=LLMTEMPLATE_SCALE_CONFIG.copy(),model_config=LLMTEMPLATE_MODEL_CONFIG_COMPARATION.copy()) 107 | COMPARATION_LLMTEMPLATE.deployment_config = LLMTEMPLATE_DEPLOYMENT_CONFIG.copy() 108 | 109 | RAY_AGENT_ADDRESS = "http://localhost:52365" 110 | 111 | MODELS_MAPPING = { 112 | "gpt2": "./models/text-generation--gpt2.yaml", 113 | "t5-small": "./models/translation--t5-small.yaml", 114 | "THUDM/chatglm2-6b": "./models/text-generation--THUDM-chatglm2-6b.yaml", 115 | "THUDM/chatglm-6b": "./models/text-generation--THUDM-chatglm-6b.yaml", 116 | "Qwen/Qwen-7B": "./models/text-generation--Qwen--Qwen-7B.yaml", 117 | "Qwen/Qwen-7B-Chat": "./models/text-generation--Qwen--Qwen-7B-Chat.yaml", 118 | "LinkSoul/Chinese-Llama-2-7b": "./models/text-generation--LinkSoul--Chinese-Llama-2-7b.yaml", 119 | "bigscience/bloom-560m": "./models/text-generation--bigscience--bloom-560m.yaml", 120 | "baichuan-inc/Baichuan-7B": "./models/text-generation--baichuan-inc--Baichuan-7B.yaml", 121 | "distilbert-base-uncased-finetuned-sst-2-english": "./models/text-classification--distilbert-base-uncased-finetuned-sst-2-english.yaml", 122 | "facebook/bart-large-cnn": "./models/summarization--facebook--bart-large-cnn.yaml", 123 | "deepset/roberta-base-squad2": "./models/question-answering--deepset--roberta-base-squad2.yaml", 124 | "nlpconnect/vit-gpt2-image-captioning": "./models/image-to-text--nlpconnect--vit-gpt2-image-captioning.yaml" 125 | } 126 | 127 | URL = "http://127.0.0.1:8000/" 128 | SERVE_RUN_HOST = "0.0.0.0" -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/default_pipeline.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Optional, Union 3 | 4 | import torch 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | 7 | from llmadmin.backend.logger import get_logger 8 | from llmadmin.backend.server.models import Response 9 | 10 | from ._base import BasePipeline 11 | from .processors import StopOnTokens 12 | from .utils import construct_prompts, truncate_to_first_stop_token 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class DefaultPipeline(BasePipeline): 18 | """Default text generation pipeline. 19 | 20 | Args: 21 | model (PreTrainedModel): Hugging Face model. 22 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer. 23 | prompt_format (Optional[str], optional): Prompt format. Defaults to None. 24 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's 25 | device. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | model: PreTrainedModel, 31 | tokenizer: PreTrainedTokenizer, 32 | prompt_format: Optional[str] = None, 33 | device: Optional[Union[str, int, torch.device]] = None, 34 | ) -> None: 35 | super().__init__( 36 | model=model, 37 | tokenizer=tokenizer, 38 | prompt_format=prompt_format, 39 | device=device, 40 | ) 41 | 42 | def preprocess(self, prompts: List[str], **generate_kwargs): 43 | st = time.monotonic() 44 | prompt_text = construct_prompts(prompts, prompt_format=self.prompt_format) 45 | instruction_text = construct_prompts(prompts, prompt_format="") 46 | if self.tokenizer.pad_token is None: 47 | self.tokenizer.pad_token = self.tokenizer.eos_token 48 | 49 | inputs = self.tokenizer( 50 | prompt_text, return_tensors="pt", padding=True, **generate_kwargs 51 | ).to(self.model.device) 52 | if not generate_kwargs.get("return_token_type_ids", True): 53 | inputs.pop("token_type_ids", None) 54 | et = time.monotonic() - st 55 | return { 56 | "inputs": inputs, 57 | "instruction_text": instruction_text, 58 | "prompt_text": prompt_text, 59 | "preprocessing_time": et, 60 | } 61 | 62 | def forward(self, model_inputs, **generate_kwargs): 63 | st = time.monotonic() 64 | inputs = model_inputs["inputs"] 65 | instruction_text = model_inputs["instruction_text"] 66 | prompt_text = model_inputs["prompt_text"] 67 | preprocessing_time = model_inputs["preprocessing_time"] 68 | generated_sequence = self.model.generate( 69 | **{ 70 | **inputs, 71 | **generate_kwargs, 72 | } 73 | ) 74 | et = time.monotonic() - st 75 | return { 76 | "inputs": inputs, 77 | "generated_sequence": generated_sequence, 78 | "instruction_text": instruction_text, 79 | "prompt_text": prompt_text, 80 | "preprocessing_time": preprocessing_time, 81 | "generation_time": et, 82 | "generate_kwargs": generate_kwargs, 83 | } 84 | 85 | def postprocess(self, model_outputs, **postprocess_kwargs) -> List[Response]: 86 | st = time.monotonic() 87 | tokens = model_outputs["generated_sequence"] 88 | input_ids = model_outputs["inputs"]["input_ids"] 89 | token_stopper = next( 90 | ( 91 | x 92 | for x in model_outputs["generate_kwargs"].get("stopping_criteria", []) 93 | if isinstance(x, StopOnTokens) 94 | ), 95 | None, 96 | ) 97 | decoded: List[Response] = [] 98 | num_generated_tokens_batch = 0 99 | num_input_tokens_batch = 0 100 | for token_unwrapped, inputs_unwrapped in zip(tokens, input_ids): 101 | logger.info( 102 | f"Unprocessed generated tokens: '{self.tokenizer.decode(token_unwrapped, skip_special_tokens=False).encode('unicode_escape').decode('utf-8')}'" 103 | ) 104 | 
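# The generated sequence contains the (possibly padded) prompt followed by the
# completion: drop the prompt portion first, then truncate at the first
# stopping sequence (when a StopOnTokens criteria was captured above) before
# decoding the remaining tokens.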
tokens = token_unwrapped[len(inputs_unwrapped) :] 105 | if token_stopper: 106 | tokens = truncate_to_first_stop_token( 107 | tokens, token_stopper.stopping_sequences 108 | ) 109 | text = ( 110 | self.tokenizer.decode(tokens, skip_special_tokens=True) 111 | .replace("\u200b", "") 112 | .strip() 113 | ) 114 | for i in range(len(inputs_unwrapped)): 115 | if inputs_unwrapped[i] != self.tokenizer.pad_token_id: 116 | break 117 | num_input_tokens = len(inputs_unwrapped[i:]) 118 | num_generated_tokens = len(tokens) 119 | response = Response( 120 | generated_text=text, 121 | num_generated_tokens=num_generated_tokens, 122 | num_input_tokens=num_input_tokens, 123 | ) 124 | num_generated_tokens_batch += num_generated_tokens 125 | num_input_tokens_batch += num_input_tokens 126 | decoded.append(response) 127 | et = time.monotonic() - st 128 | for response in decoded: 129 | response.num_generated_tokens_batch = num_generated_tokens_batch 130 | response.num_input_tokens_batch = num_input_tokens_batch 131 | response.preprocessing_time = model_outputs["preprocessing_time"] 132 | response.generation_time = model_outputs["generation_time"] 133 | response.postprocessing_time = et 134 | return decoded 135 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/default_transformers_pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, List, Optional, Union 2 | 3 | import torch 4 | from transformers import Pipeline as TransformersPipeline 5 | from transformers import PreTrainedModel, PreTrainedTokenizer, pipeline 6 | 7 | from llmadmin.backend.logger import get_logger 8 | from llmadmin.backend.server.models import Prompt, Response 9 | 10 | from ._base import BasePipeline 11 | from .utils import construct_prompts, construct_prompts_experimental 12 | # from llmadmin.backend.server.utils import render_gradio_params 13 | from .default_pipeline import DefaultPipeline 14 | 15 | try: 16 | import transformers 17 | from transformers import pipelines 18 | except ImportError as ie: 19 | raise ImportError( 20 | "transformers not installed. Please try `pip install transformers`" 21 | ) from ie 22 | 23 | if TYPE_CHECKING: 24 | from ..initializers._base import LLMInitializer 25 | 26 | logger = get_logger(__name__) 27 | 28 | 29 | class DefaultTransformersPipeline(BasePipeline): 30 | """Text generation pipeline using Transformers Pipeline. 31 | 32 | May not support all features. 33 | 34 | Args: 35 | model (PreTrainedModel): Hugging Face model. 36 | tokenizer (PreTrainedTokenizer): Hugging Face tokenizer. 37 | prompt_format (Optional[str], optional): Prompt format. Defaults to None. 38 | device (Optional[Union[str, int, torch.device]], optional): Device to place model on. Defaults to model's 39 | device. 
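        task (str, optional): Transformers pipeline task name. Accepted for API compatibility but not currently used by this class. Defaults to None.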
40 | """ 41 | 42 | def __init__( 43 | self, 44 | model: PreTrainedModel, 45 | tokenizer: PreTrainedTokenizer, 46 | prompt_format: Optional[str] = None, 47 | device: Optional[Union[str, int, torch.device]] = None, 48 | task: str = None, 49 | ) -> None: 50 | if not hasattr(model, "generate"): 51 | raise ValueError("Model must have a generate method.") 52 | super().__init__(model, tokenizer, prompt_format, device) 53 | 54 | self.pipeline = None 55 | self.preprocess = None 56 | self.postprocess = None 57 | 58 | def _get_transformers_pipeline(self, **kwargs) -> TransformersPipeline: 59 | default_kwargs = dict( 60 | task="text-generation", 61 | model=self.model, 62 | tokenizer=self.tokenizer, 63 | device=None, 64 | ) 65 | transformers_pipe = pipeline(**{**default_kwargs, **kwargs}) 66 | transformers_pipe.device = self.device 67 | return transformers_pipe 68 | 69 | @torch.inference_mode() 70 | def __call__(self, inputs: List[Union[str, Prompt]], **kwargs) -> List[Response]: 71 | if not self.pipeline: 72 | self.pipeline = self._get_transformers_pipeline() 73 | 74 | logger.info(f"input from pipeline: ****** {inputs}") 75 | inputs = construct_prompts_experimental( 76 | inputs, prompt_format=self.prompt_format) 77 | 78 | logger.info(f"input from pipeline: ****** {inputs}") 79 | 80 | if self.preprocess: 81 | data = self.preprocess(inputs) 82 | 83 | logger.info(data) 84 | kwargs.pop("stopping_sequences", None) 85 | kwargs.pop("timeout_s", None) 86 | kwargs.pop("start_timestamp", None) 87 | # special cases that needs to be handled differently 88 | if isinstance( 89 | self.pipeline, 90 | ( 91 | pipelines.text_classification.TextClassificationPipeline, 92 | pipelines.text2text_generation.Text2TextGenerationPipeline, 93 | pipelines.text2text_generation.TranslationPipeline, 94 | ), 95 | ): 96 | data = self.pipeline(*data, **kwargs) 97 | else: 98 | data = self.pipeline(**data, **kwargs) 99 | 100 | logger.info(f"output from pipeline: ****** {data}") 101 | if self.postprocess: 102 | output = self.postprocess(data) 103 | 104 | return output 105 | 106 | @classmethod 107 | def from_initializer( 108 | cls, 109 | initializer: "LLMInitializer", 110 | model_id: str, 111 | prompt_format: Optional[str] = None, 112 | device: Optional[Union[str, int, torch.device]] = None, 113 | stopping_sequences: List[Union[int, str]] = None, 114 | **kwargs, 115 | ) -> "DefaultTransformersPipeline": 116 | model_from_pretrained_kwargs = initializer.get_model_from_pretrained_kwargs() 117 | default_kwargs = dict( 118 | model=model_id, 119 | **kwargs, 120 | **model_from_pretrained_kwargs 121 | ) 122 | 123 | transformers_pipe = pipeline( 124 | **default_kwargs, 125 | model_kwargs=initializer.get_model_init_kwargs(), 126 | ) 127 | # transformers_pipe.model = initializer.postprocess_model(transformers_pipe.model) 128 | pipe = cls( 129 | model=transformers_pipe.model, 130 | tokenizer=transformers_pipe.tokenizer, 131 | prompt_format=prompt_format, 132 | device=device, 133 | # stopping_sequences=stopping_sequences, 134 | **kwargs, 135 | ) 136 | pipe.pipeline = transformers_pipe 137 | transformers_pipe.device = pipe.device 138 | 139 | # if "task" in kwargs: 140 | # pipeline_info = render_gradio_params(kwargs["task"]) 141 | # pipe.preprocess = pipeline_info["preprocess"] 142 | # pipe.postprocess = pipeline_info["postprocess"] 143 | 144 | return pipe 145 | 146 | def preprocess(self, prompts: List[str], **generate_kwargs): 147 | pass 148 | 149 | def forward(self, model_inputs, **generate_kwargs): 150 | pass 151 | 
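# A minimal, illustrative sketch of the raw transformers pipeline that
# _get_transformers_pipeline() assembles; "gpt2" is only a placeholder checkpoint.
if __name__ == "__main__":
    demo_pipe = pipeline(
        task="text-generation",  # same default task as above
        model="gpt2",            # placeholder model id
        tokenizer="gpt2",
        device=None,
    )
    print(demo_pipe("Hello, world", max_new_tokens=16)[0]["generated_text"])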
-------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/noheader_AdvertiseGen.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModel, DataCollatorForSeq2Seq 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | import jieba 7 | from rouge_chinese import Rouge 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | 10 | class NoheaderAdvertiseGen(Task): 11 | AUTO_MODEL_CLASS = AutoModel 12 | 13 | DATASET_PATH = "AdvertiseGen" 14 | 15 | def get_data_proprocess(self) -> Any: 16 | tokenizer = self.tokenizer 17 | max_length = self.ft_config.train_config.base_config.max_length 18 | # adopt python decorator TODO 19 | def preprocess_function(examples: pd.DataFrame): 20 | # examples = examples.to_dict("list") 21 | #-- start 22 | max_source_length = int(max_length / 2) 23 | max_target_length = max_length - max_source_length 24 | # max_seq_length = data_args.max_source_length + data_args.max_target_length 25 | 26 | model_inputs = { 27 | "input_ids": [], 28 | "labels": [], 29 | } 30 | for i in range(len(examples["content"])): 31 | if examples["content"][i] and examples["summary"][i]: 32 | prompt, answer = examples["content"][i], examples["summary"][i] 33 | 34 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) 35 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False) 36 | 37 | if len(a_ids) > max_source_length - 1: 38 | a_ids = a_ids[: max_source_length - 1] 39 | 40 | if len(b_ids) > max_target_length - 2: 41 | b_ids = b_ids[: max_target_length - 2] 42 | 43 | input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) 44 | 45 | context_length = input_ids.index(tokenizer.bos_token_id) 46 | mask_position = context_length - 1 47 | labels = [-100] * context_length + input_ids[mask_position+1:] 48 | 49 | # pad_len = max_length - len(input_ids) 50 | # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len 51 | # labels = labels + [tokenizer.pad_token_id] * pad_len 52 | # if data_args.ignore_pad_token_for_loss: 53 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 54 | 55 | model_inputs["input_ids"].append(input_ids) 56 | model_inputs["labels"].append(labels) 57 | 58 | 59 | # Add back the original columns 60 | ret = {**examples, **model_inputs} 61 | return pd.DataFrame.from_dict(ret) 62 | 63 | return preprocess_function 64 | 65 | def get_compute_metrics(self) -> Any: 66 | tokenizer = self.tokenizer 67 | 68 | def compute_metrics(eval_preds): 69 | preds, labels = eval_preds 70 | if isinstance(preds, tuple): 71 | preds = preds[0] 72 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 73 | 74 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 75 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 76 | 77 | score_dict = { 78 | "rouge-1": [], 79 | "rouge-2": [], 80 | "rouge-l": [], 81 | "bleu-4": [] 82 | } 83 | for pred, label in zip(decoded_preds, decoded_labels): 84 | hypothesis = list(jieba.cut(pred)) 85 | reference = list(jieba.cut(label)) 86 | rouge = Rouge() 87 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference)) 88 | result = scores[0] 89 | 90 | for k, v in result.items(): 91 | score_dict[k].append(round(v["f"] * 100, 4)) 92 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) 93 | 
score_dict["bleu-4"].append(round(bleu_score * 100, 4)) 94 | 95 | for k, v in score_dict.items(): 96 | score_dict[k] = float(np.mean(v)) 97 | return score_dict 98 | 99 | return compute_metrics 100 | 101 | def get_data_collator(self) -> Any: 102 | data_collator = DataCollatorForSeq2Seq( 103 | tokenizer=self.tokenizer, 104 | model=self.model, 105 | label_pad_token_id=-100, 106 | pad_to_multiple_of=None, 107 | padding=True 108 | ) 109 | return data_collator 110 | 111 | def training_key(self): 112 | """ 113 | :return: Iterable[obj] 114 | An iterable of any object that doc_to_text can handle 115 | """ 116 | return "train" 117 | 118 | def validation_key(self): 119 | """ 120 | :return: Iterable[obj] 121 | An iterable of any object that doc_to_text can handle 122 | """ 123 | return "validation" 124 | 125 | def getTrainDataSet(self): 126 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True) 127 | 128 | def getEvalDataSet(self): 129 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True) 130 | 131 | def getSmallTrainDataSet(self, len: int): 132 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 133 | 134 | def getSmallEvalDataSet(self, len: int): 135 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True) 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LLM - Finetune 3 | 4 | A large-scale model training framework that supports LoRA, full-parameter fine-tuning, and other methods. Define a YAML file specifying the base model, dataset, and training parameters to launch training or fine-tuning of your own models: easy to define, easy to start. Feedback and stars⭐️ are welcome! 5 | 6 | image 7 | 8 | Two steps to run your LLM fine-tuning: 9 | 10 | ## 1. Easy Install 11 | 12 | ### Installation 13 | 14 | Use a `bash` shell for the following commands. 15 | 16 | ```bash 17 | git clone https://github.com/OpenCSGs/llm-finetune.git 18 | cd llm-finetune 19 | pip install . # Install from CN: 'pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple' 20 | ``` 21 | 22 | ## 2. Easy Run 23 | ### Finetune a model by command 24 | 25 | ``` 26 | llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 27 | ``` 28 | 29 | Your fine-tune task is now starting. 30 | 31 | - You can add more YAML files to define your own tasks. 32 | 33 | ******* 34 | ## (Optional) Additional launchers: Launch with accelerate or deepspeed 35 | Ensure accelerate and deepspeed are installed, then follow the steps below. 36 | 37 | ### Launch by accelerate 38 | 39 | Modify the parameters of `accelerate launch` for distributed training. 40 | 41 | #### Finetune on CPU 42 | 43 | ``` 44 | # Use CPU 45 | accelerate launch --cpu --num_machines=1 --num_processes=1 --num_cpu_threads_per_process=1 --mixed_precision=no --dynamo_backend=no llm_finetune.py run-ft --ft=/Users/hub/code/jihulab/opencsg/llm-inference/models/ft--sequenceclassification--bert-base-cased.yaml 46 | ``` 47 | 48 | #### Finetune on GPU on a single host 49 | 50 | Control GPU visibility with `CUDA_VISIBLE_DEVICES`.
51 | 52 | ``` 53 | # Use GPU:0 54 | CUDA_VISIBLE_DEVICES=0 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 55 | 56 | # Use GPU:1 57 | CUDA_VISIBLE_DEVICES=1 accelerate launch llmfinetune run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml 58 | 59 | # Use GPU:0 60 | accelerate launch --num_machines=1 --num_processes=1 --gpu_ids=0 llmfinetune ... 61 | ``` 62 | 63 | #### Finetune on multiple GPUs on a single host 64 | 65 | ``` 66 | # Use all GPUs with mixed precision disabled 67 | accelerate launch --multi_gpu llmfinetune ... 68 | 69 | # Use all GPUs with mixed precision (fp16) 70 | accelerate launch --multi_gpu --mixed_precision=fp16 llmfinetune ... 71 | 72 | # Use GPU:0 and GPU:1 73 | CUDA_VISIBLE_DEVICES=0,1 accelerate launch --multi_gpu --gpu_ids=0,1 llmfinetune ... 74 | 75 | # Launch with 2 GPUs 76 | accelerate launch --multi_gpu --num_processes 2 llmfinetune ... 77 | ``` 78 | 79 | ``` 80 | # default_config.yaml for a single host with multiple GPUs 81 | compute_environment: LOCAL_MACHINE 82 | deepspeed_config: {} 83 | distributed_type: MULTI_GPU 84 | downcast_bf16: 'no' 85 | dynamo_backend: 'NO' 86 | fsdp_config: {} 87 | gpu_ids: all # use all GPU ids 88 | machine_rank: 0 89 | main_training_function: main 90 | megatron_lm_config: {} 91 | mixed_precision: fp16 # mixed precision 92 | num_machines: 1 # a single machine 93 | num_processes: 4 # 4 GPUs 94 | rdzv_backend: static 95 | same_network: true 96 | use_cpu: false 97 | 98 | 99 | accelerate launch --config_file default_config.yaml llmfinetune ... 100 | ``` 101 | 102 | #### Finetune on multiple GPUs across multiple hosts 103 | 104 | All hosts must have passwordless SSH access to each other. 105 | 106 | ``` 107 | # default_config.yaml 108 | compute_environment: LOCAL_MACHINE 109 | deepspeed_config: 110 | deepspeed_multinode_launcher: standard 111 | gradient_accumulation_steps: 1 112 | gradient_clipping: 1.0 113 | offload_optimizer_device: none 114 | offload_param_device: none 115 | zero3_init_flag: true 116 | zero3_save_16bit_model: true 117 | zero_stage: 3 118 | distributed_type: DEEPSPEED 119 | downcast_bf16: 'no' 120 | dynamo_config: {} 121 | fsdp_config: {} 122 | main_training_function: main 123 | megatron_lm_config: {} 124 | mixed_precision: fp16 125 | num_machines: 2 # 2 nodes 126 | num_processes: 16 # 16 GPUs across all nodes 127 | tpu_env: [] 128 | tpu_use_cluster: false 129 | tpu_use_sudo: false 130 | use_cpu: false 131 | 132 | # Run on every host, specifying `RANK`, `MASTER_ADDR`, and `MASTER_PORT` 133 | accelerate launch --config_file default_config.yaml \ 134 | --machine_rank ${RANK} \ 135 | --main_process_ip ${MASTER_ADDR} \ 136 | --main_process_port ${MASTER_PORT} \ 137 | ... 138 | 139 | # --machine_rank: 0 for the main/master node; 1, 2, 3, ... for the other nodes
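# Illustrative example (the address and port below are placeholders, adjust for your cluster):
#   on node 0: RANK=0 MASTER_ADDR=192.168.1.10 MASTER_PORT=29500 accelerate launch --config_file default_config.yaml --machine_rank ${RANK} --main_process_ip ${MASTER_ADDR} --main_process_port ${MASTER_PORT} llm_finetune.py run-ft --ft=./models/ft--sequenceclassification--bert-base-uncased-lora.yaml
#   on node 1: the same command with RANK=1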
140 | ``` 141 | 142 | #### Finetune with DeepSpeed on multiple GPUs across multiple hosts 143 | 144 | All hosts must have passwordless SSH access to each other. 145 | 146 | ``` 147 | # myhostfile 148 | node1 slots=1 149 | node2 slots=1 150 | 151 | # deepspeed.json 152 | { 153 | "train_batch_size": "auto", 154 | "train_micro_batch_size_per_gpu": "auto", 155 | "gradient_accumulation_steps": "auto", 156 | "gradient_clipping": "auto", 157 | "zero_allow_untested_optimizer": true, 158 | "fp16": { 159 | "enabled": "auto", 160 | "loss_scale": 0, 161 | "initial_scale_power": 16, 162 | "loss_scale_window": 1000, 163 | "hysteresis": 2, 164 | "min_loss_scale": 1 165 | }, 166 | "zero_optimization": { 167 | "stage": 2, 168 | "allgather_partitions": true, 169 | "allgather_bucket_size": 5e8, 170 | "reduce_scatter": true, 171 | "reduce_bucket_size": 5e8, 172 | "overlap_comm": false, 173 | "contiguous_gradients": true 174 | } 175 | } 176 | 177 | deepspeed --num_nodes=2 --hostfile=myhostfile --deepspeed deepspeed.json ... 178 | 179 | # --num_nodes: number of hosts 180 | # --hostfile: hostfile listing each host and its number of GPU slots 181 | # --deepspeed: DeepSpeed config file 182 | 183 | ``` 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /llmadmin/common/backend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, List, Union 5 | 6 | import requests 7 | 8 | from llmadmin.common.constants import TIMEOUT 9 | 10 | 11 | class BackendError(RuntimeError): 12 | def __init__(self, *args: object, **kwargs) -> None: 13 | self.response = kwargs.pop("response", None) 14 | super().__init__(*args) 15 | 16 | logger = logging.getLogger("ray.logger") 17 | 18 | def get_llmadmin_backend(url: str = "http://127.0.0.1:8000/cmp_models_default"): 19 | """ 20 | Establishes a connection to the LLMAdmin backend, using information 21 | from environment variables. 22 | If the AVIARY_MOCK environment variable is set, then a mock backend is used. 23 | 24 | For a direct connection to the llmadmin backend (e.g. running on the same cluster), 25 | no AVIARY_TOKEN is required. Otherwise, the AVIARY_URL and AVIARY_TOKEN environment variables 26 | are required. 27 | 28 | Returns: 29 | backend: An instance of the Backend class.
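    Example (illustrative; assumes a backend server is running at the given URL):
        backend = get_llmadmin_backend("http://127.0.0.1:8000/cmp_models_default")
        print(backend.models())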
30 | """ 31 | mock_backend = os.getenv("AVIARY_MOCK", False) 32 | if mock_backend: 33 | backend = MockBackend() 34 | return backend 35 | print(os.getenv("AVIARY_URL")) 36 | llmadmin_url = url 37 | assert llmadmin_url is not None, "AVIARY_URL must be set" 38 | backend_token = os.getenv("AVIARY_TOKEN") 39 | bearer = f"Bearer {backend_token}" if backend_token is not None else "" 40 | if not llmadmin_url.endswith("/"): 41 | llmadmin_url += "/" 42 | print("Connecting to LLMAdmin backend at: ", llmadmin_url) 43 | backend = LLMAdminBackend(llmadmin_url, bearer) 44 | return backend 45 | 46 | 47 | class Backend(ABC): 48 | """Abstract interface for talking to LLMAdmin.""" 49 | 50 | @abstractmethod 51 | def models(self) -> List[str]: 52 | pass 53 | 54 | @abstractmethod 55 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 56 | pass 57 | 58 | @abstractmethod 59 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 60 | pass 61 | 62 | @abstractmethod 63 | def batch_completions( 64 | self, prompts: List[str], llm: str 65 | ) -> List[Dict[str, Union[str, float, int]]]: 66 | pass 67 | 68 | 69 | class LLMAdminBackend(Backend): 70 | """Interface for talking to LLMAdmin. 71 | Deliberately designed to be similar to OpenAI's 72 | Completions interface. 73 | 74 | https://platform.openai.com/docs/api-reference/completions?lang=python 75 | """ 76 | 77 | def __init__(self, backend_url: str, bearer: str): 78 | assert "::param" not in backend_url, "backend_url not set correctly" 79 | assert "::param" not in bearer, "bearer not set correctly" 80 | 81 | self.backend_url = backend_url 82 | self.bearer = bearer 83 | self.header = {"Authorization": self.bearer} 84 | 85 | def models(self) -> List[str]: 86 | url = self.backend_url + "models" 87 | print("Connecting backend to get models at: ", url) 88 | response = requests.get(url, headers=self.header, timeout=TIMEOUT) 89 | try: 90 | result = response.json() 91 | except requests.JSONDecodeError as e: 92 | raise BackendError( 93 | f"Error decoding JSON from {url}. Text response: {response.text}", 94 | response=response, 95 | ) from e 96 | return result 97 | 98 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 99 | url = self.backend_url + "metadata/" + llm.replace("/", "--") 100 | response = requests.get(url, headers=self.header, timeout=TIMEOUT) 101 | try: 102 | result = response.json() 103 | except requests.JSONDecodeError as e: 104 | raise BackendError( 105 | f"Error decoding JSON from {url}. Text response: {response.text}", 106 | response=response, 107 | ) from e 108 | return result 109 | 110 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 111 | url = self.backend_url + "query/" + llm.replace("/", "--") 112 | response = requests.post( 113 | url, 114 | headers=self.header, 115 | json={"prompt": prompt}, 116 | timeout=TIMEOUT, 117 | ) 118 | try: 119 | return response.json()[llm] 120 | except requests.JSONDecodeError as e: 121 | raise BackendError( 122 | f"Error decoding JSON from {url}. 
Text response: {response.text}", 123 | response=response, 124 | ) from e 125 | 126 | def batch_completions( 127 | self, prompts: List[str], llm: str 128 | ) -> List[Dict[str, Union[str, float, int]]]: 129 | url = self.backend_url + "query/batch/" + llm.replace("/", "--") 130 | response = requests.post( 131 | url, 132 | headers=self.header, 133 | json=[{"prompt": prompt} for prompt in prompts], 134 | timeout=TIMEOUT, 135 | ) 136 | try: 137 | return response.json()[llm] 138 | except requests.JSONDecodeError as e: 139 | raise BackendError( 140 | f"Error decoding JSON from {url}. Text response: {response.text}", 141 | response=response, 142 | ) from e 143 | 144 | 145 | class MockBackend(Backend): 146 | """Mock backend for testing""" 147 | 148 | def __init__(self): 149 | pass 150 | 151 | def models(self) -> List[str]: 152 | return ["A", "B", "C"] 153 | 154 | def metadata(self, llm: str) -> Dict[str, Dict[str, Any]]: 155 | return { 156 | "metadata": { 157 | "model_config": { 158 | "model_id": llm, 159 | "model_url": f"https://huggingface.co/org/{llm}", 160 | "model_description": f"This is a model description for model {llm}", 161 | } 162 | } 163 | } 164 | 165 | def completions(self, prompt: str, llm: str) -> Dict[str, Union[str, float, int]]: 166 | return { 167 | "generated_text": prompt, 168 | "total_time": 99, 169 | "num_total_tokens": 42.3, 170 | } 171 | 172 | def batch_completions( 173 | self, prompts: List[str], llm: str 174 | ) -> List[Dict[str, Union[str, float, int]]]: 175 | return [ 176 | { 177 | "generated_text": prompt, 178 | "total_time": 99, 179 | "num_total_tokens": 42.3, 180 | } 181 | for prompt in prompts 182 | ] 183 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union 3 | 4 | import torch 5 | 6 | from llmadmin.backend.logger import get_logger 7 | from llmadmin.backend.server.models import Response 8 | 9 | from ...initializers.llamacpp import LlamaCppInitializer, LlamaCppTokenizer 10 | from .._base import StreamingPipeline 11 | from ..utils import decode_stopping_sequences_where_needed, construct_prompts 12 | 13 | if TYPE_CHECKING: 14 | from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | class LlamaCppPipeline(StreamingPipeline): 20 | """Text generation pipeline using llama.cpp. 
21 | 22 | May not support all features.""" 23 | 24 | def __init__( 25 | self, 26 | model: "Llama", 27 | tokenizer: LlamaCppTokenizer, 28 | prompt_format: Optional[str] = None, 29 | device: Optional[Union[str, int, torch.device]] = None, 30 | **kwargs, 31 | ) -> None: 32 | from llama_cpp import Llama 33 | 34 | if not isinstance(model, Llama): 35 | raise TypeError("Model must be an instance of llama_cpp.Llama.") 36 | self.model = model 37 | self.kwargs = kwargs 38 | self.tokenizer = tokenizer 39 | self.device = device 40 | self.prompt_format = prompt_format 41 | 42 | def _get_logits_processors( 43 | self, generate_kwargs: Dict[str, Any], model_inputs=None 44 | ) -> "LogitsProcessorList": 45 | from llama_cpp import LogitsProcessorList 46 | 47 | from llmadmin.backend.llm.pipelines.llamacpp.processors import ( 48 | LlamaCppMinNewTokensLengthLogitsProcessor, 49 | ) 50 | 51 | lst = [] 52 | 53 | if "min_new_tokens" in generate_kwargs: 54 | lst.append( 55 | LlamaCppMinNewTokensLengthLogitsProcessor( 56 | prompt_length_to_skip=len(model_inputs["tokenized_inputs"]), 57 | min_new_tokens=generate_kwargs.pop("min_new_tokens", 4), 58 | eos_token_id=self.model.token_eos(), 59 | ) 60 | ) 61 | 62 | return LogitsProcessorList(lst) 63 | 64 | def _get_stopping_criteria( 65 | self, generate_kwargs: Dict[str, Any], model_inputs=None 66 | ) -> "StoppingCriteriaList": 67 | from llama_cpp import StoppingCriteriaList 68 | 69 | from llmadmin.backend.llm.pipelines.llamacpp.processors import ( 70 | LlamaMaxTimeCriteria, 71 | ) 72 | 73 | lst = [] 74 | 75 | timeout_s = generate_kwargs.pop("timeout_s", None) 76 | start_timestamp = generate_kwargs.pop("start_timestamp", None) 77 | if timeout_s is not None and start_timestamp is not None: 78 | lst.append(LlamaMaxTimeCriteria(timeout_s, start_timestamp)) 79 | 80 | return StoppingCriteriaList(lst) 81 | 82 | def _add_default_generate_kwargs( 83 | self, generate_kwargs: Dict[str, Any], model_inputs=None 84 | ) -> Dict[str, Any]: 85 | generate_kwargs = generate_kwargs.copy() 86 | generate_kwargs.setdefault("echo", False) 87 | stopping_sequences = generate_kwargs.pop("stopping_sequences") 88 | stopping_sequences = decode_stopping_sequences_where_needed( 89 | self.tokenizer, stopping_sequences 90 | ) 91 | generate_kwargs.setdefault("stop", stopping_sequences) 92 | generate_kwargs["logits_processor"] = self._get_logits_processors( 93 | generate_kwargs, model_inputs=model_inputs 94 | ) 95 | generate_kwargs["stopping_criteria"] = self._get_stopping_criteria( 96 | generate_kwargs, model_inputs=model_inputs 97 | ) 98 | return generate_kwargs 99 | 100 | def __call__(self, inputs: List[str], **kwargs) -> List[Response]: 101 | logger.info(inputs) 102 | inputs = construct_prompts( 103 | inputs, prompt_format=self.prompt_format) 104 | 105 | logger.info(inputs) 106 | tokenized_inputs = self.tokenizer.encode(inputs[0]) 107 | kwargs = self._add_default_generate_kwargs( 108 | kwargs, 109 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs}, 110 | ) 111 | 112 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}") 113 | responses = [] 114 | for input in inputs: 115 | st = time.monotonic() 116 | output = self.model(input, **kwargs) 117 | gen_time = time.monotonic() - st 118 | text = output["choices"][0]["text"].replace("\u200b", "").strip() 119 | responses.append( 120 | Response( 121 | generated_text=text, 122 | num_generated_tokens=output["usage"]["completion_tokens"], 123 | num_input_tokens=output["usage"]["prompt_tokens"], 124 | 
num_generated_tokens_batch=output["usage"]["completion_tokens"], 125 | num_input_tokens_batch=output["usage"]["prompt_tokens"], 126 | preprocessing_time=None, 127 | postprocessing_time=None, 128 | generation_time=gen_time, 129 | ) 130 | ) 131 | return responses 132 | 133 | def stream( 134 | self, 135 | inputs: List[str], 136 | **kwargs, 137 | ) -> Iterator[torch.LongTensor]: 138 | tokenized_inputs = self.tokenizer.encode(inputs[0]) 139 | kwargs = self._add_default_generate_kwargs( 140 | kwargs, 141 | model_inputs={"inputs": inputs, "tokenized_inputs": tokenized_inputs}, 142 | ) 143 | 144 | logger.info(f"Forward params: {kwargs}, model_inputs {inputs}") 145 | first_token_done = False 146 | for input in inputs: 147 | for output in self.model(input, stream=True, **kwargs): 148 | st = time.monotonic() 149 | gen_time = time.monotonic() - st 150 | text = output["choices"][0]["text"].replace("\u200b", "") 151 | if not first_token_done: 152 | text = text.lstrip() 153 | first_token_done = True 154 | yield [ 155 | Response( 156 | generated_text=text, 157 | num_generated_tokens=1, 158 | num_input_tokens=len(tokenized_inputs), 159 | num_generated_tokens_batch=1, 160 | num_input_tokens_batch=len(tokenized_inputs), 161 | preprocessing_time=None, 162 | postprocessing_time=None, 163 | generation_time=gen_time, 164 | ) 165 | ] 166 | 167 | def preprocess(self, prompts: List[str], **generate_kwargs): 168 | pass 169 | 170 | def forward(self, model_inputs, **generate_kwargs): 171 | pass 172 | 173 | @classmethod 174 | def from_initializer( 175 | cls, 176 | initializer: "LlamaCppInitializer", 177 | model_id: str, 178 | device: Optional[Union[str, int, torch.device]] = None, 179 | **kwargs, 180 | ) -> "LlamaCppPipeline": 181 | assert isinstance(initializer, LlamaCppInitializer) 182 | logger.info(f"LlamaCppPipeline initializer loading model: {model_id}") 183 | model, tokenizer = initializer.load(model_id) 184 | logger.info(f"LlamaCppPipeline loaded model: {model}") 185 | return cls( 186 | model, 187 | tokenizer, 188 | device=device, 189 | **kwargs, 190 | ) -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/tasks/text_generation_AdvertiseGen.py: -------------------------------------------------------------------------------- 1 | from ._base import Task 2 | from transformers import AutoModel, DataCollatorForSeq2Seq, AutoModelForCausalLM 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | import jieba 7 | from rouge_chinese import Rouge 8 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 9 | 10 | class NoheaderAdvertiseGen(Task): 11 | # AUTO_MODEL_CLASS = AutoModel 12 | AUTO_MODEL_CLASS = AutoModelForCausalLM 13 | 14 | DATASET_PATH = "AdvertiseGen" 15 | prompt_column = "content" 16 | response_column = "summary" 17 | # history_column = "history" 18 | 19 | def get_data_proprocess(self) -> Any: 20 | self.prompt_column = self.ft_config.data_config.input_columns[0] 21 | self.response_column = self.ft_config.data_config.validation_column 22 | self.DATASET_PATH = self.ft_config.data_config.data_path 23 | tokenizer = self.tokenizer 24 | max_length = self.ft_config.train_config.base_config.max_length 25 | # adopt python decorator TODO 26 | def preprocess_function(examples): 27 | # examples = examples.to_dict("list") 28 | #-- start 29 | max_source_length = int(max_length / 2) 30 | max_target_length = max_length - max_source_length 31 | max_source_length = 64 32 | max_target_length = 128 33 | max_seq_length = 
max_source_length + max_target_length + 1 34 | 35 | model_inputs = { 36 | "input_ids": [], 37 | "labels": [], 38 | } 39 | prefix = "" 40 | for i in range(len(examples[self.prompt_column])): 41 | if examples[self.prompt_column][i] and examples[self.response_column][i]: 42 | query, answer = examples[self.prompt_column][i], examples[self.response_column][i] 43 | 44 | # history = examples[history_column][i] if history_column is not None else None 45 | # history = None 46 | # prompt = tokenizer.build_prompt(query, history) 47 | 48 | prompt = prefix + query 49 | print(f"tokenizer is: {tokenizer}") 50 | a_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True, padding=True, 51 | max_length=max_source_length) 52 | b_ids = tokenizer.encode(text=answer, add_special_tokens=False, truncation=True, padding=True, 53 | max_length=max_target_length) 54 | 55 | context_length = len(a_ids) 56 | input_ids = a_ids + b_ids + [tokenizer.eos_token_id] 57 | labels = [tokenizer.pad_token_id] * context_length + b_ids + [tokenizer.eos_token_id] 58 | 59 | pad_len = max_seq_length - len(input_ids) 60 | input_ids = input_ids + [tokenizer.pad_token_id] * pad_len 61 | labels = labels + [tokenizer.pad_token_id] * pad_len 62 | 63 | # if data_args.ignore_pad_token_for_loss: 64 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 65 | # labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels] 66 | 67 | model_inputs["input_ids"].append(input_ids) 68 | model_inputs["labels"].append(labels) 69 | 70 | return model_inputs 71 | 72 | return preprocess_function 73 | 74 | def get_eval_preprocess(self) -> Any: 75 | tokenizer = self.tokenizer 76 | def preprocess_function_eval(examples): 77 | max_source_length = 64 78 | max_target_length = 128 79 | inputs, targets = [], [] 80 | prefix = "" 81 | for i in range(len(examples[self.prompt_column])): 82 | if examples[self.prompt_column][i] and examples[self.response_column][i]: 83 | query = examples[self.prompt_column][i] 84 | # history = examples[history_column][i] if history_column is not None else None 85 | # history = None 86 | # prompt = tokenizer.build_prompt(query, history) 87 | inputs.append(query) 88 | targets.append(examples[self.response_column][i]) 89 | 90 | inputs = [prefix + inp for inp in inputs] 91 | model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding=True) 92 | labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True) 93 | 94 | # if data_args.ignore_pad_token_for_loss: 95 | # labels["input_ids"] = [ 96 | # [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 97 | # ] 98 | model_inputs["labels"] = labels["input_ids"] 99 | 100 | return model_inputs 101 | 102 | return preprocess_function_eval 103 | 104 | def get_compute_metrics(self) -> Any: 105 | tokenizer = self.tokenizer 106 | 107 | def compute_metrics(eval_preds): 108 | preds, labels = eval_preds 109 | if isinstance(preds, tuple): 110 | preds = preds[0] 111 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 112 | 113 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 114 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 115 | 116 | score_dict = { 117 | "rouge-1": [], 118 | "rouge-2": [], 119 | "rouge-l": [], 120 | "bleu-4": [] 121 | } 122 | for pred, label in zip(decoded_preds, decoded_labels): 123 | hypothesis = list(jieba.cut(pred)) 124 | reference = list(jieba.cut(label)) 125 | rouge = Rouge() 
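                # rouge_chinese expects whitespace-separated token strings, hence the ' '.join(...) on the jieba tokens below.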
126 | scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference)) 127 | result = scores[0] 128 | 129 | for k, v in result.items(): 130 | score_dict[k].append(round(v["f"] * 100, 4)) 131 | bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) 132 | score_dict["bleu-4"].append(round(bleu_score * 100, 4)) 133 | 134 | for k, v in score_dict.items(): 135 | score_dict[k] = float(np.mean(v)) 136 | return score_dict 137 | 138 | return compute_metrics 139 | 140 | def get_data_collator(self) -> Any: 141 | # label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id 142 | label_pad_token_id = self.tokenizer.pad_token_id 143 | data_collator = DataCollatorForSeq2Seq( 144 | tokenizer=self.tokenizer, 145 | model=self.model, 146 | label_pad_token_id=label_pad_token_id, 147 | pad_to_multiple_of=None, 148 | # padding=True 149 | padding=False 150 | ) 151 | return data_collator 152 | 153 | def training_key(self): 154 | """ 155 | :return: Iterable[obj] 156 | A iterable of any object, that doc_to_text can handle 157 | """ 158 | return "train" 159 | 160 | def validation_key(self): 161 | """ 162 | :return: Iterable[obj] 163 | A iterable of any object, that doc_to_text can handle 164 | """ 165 | return "validation" 166 | 167 | def getTrainDataSet(self): 168 | return self.dataset[self.training_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 169 | 170 | def getEvalDataSet(self): 171 | return self.dataset[self.validation_key()].map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 172 | 173 | def getSmallTrainDataSet(self, len: int): 174 | return self.dataset[self.training_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 175 | 176 | def getSmallEvalDataSet(self, len: int): 177 | return self.dataset[self.validation_key()].select(range(len)).map(self.get_data_proprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 178 | # return self.dataset[self.validation_key()].select(range(len)).map(self.get_eval_preprocess(), batched=True, remove_columns=[self.prompt_column, self.response_column]) 179 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/ray_train.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | import torch 5 | from ._base import BaseFT 6 | from llmadmin.backend.server.models import FTApp 7 | 8 | from datasets import load_dataset 9 | from transformers import AutoTokenizer 10 | import ray.data 11 | import torch 12 | import numpy as np 13 | 14 | from datasets import load_metric 15 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 16 | 17 | import ray.train 18 | from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback 19 | from ray.train.torch import TorchTrainer 20 | from ray.train import RunConfig, ScalingConfig, CheckpointConfig, FailureConfig 21 | from llmadmin.backend.logger import get_logger 22 | 23 | logger = get_logger(__name__) 24 | 25 | # GLUE_TASKS = [ 26 | # "cola", 27 | # "mnli", 28 | # "mnli-mm", 29 | # "mrpc", 30 | # "qnli", 31 | # "qqp", 32 | # "rte", 33 | # "sst2", 34 | # "stsb", 35 | # "wnli", 36 | # ] 37 | 38 | class RayTrain(BaseFT): 39 | 40 | def __init__(self, 
ftApp: FTApp): 41 | self.init_model_dataset() 42 | super().__init__(ftapp=ftApp) 43 | 44 | def init_model_dataset(self): 45 | self.use_gpu = False # set this to False to run on CPUs 46 | self.num_workers = 2 # set this to number of GPUs or CPUs you want to use 47 | logger.info(f"Is CUDA available: {torch.cuda.is_available()}") 48 | logger.info(f"init model and dataset with num_workers={self.num_workers}, use_gpu={self.use_gpu}") 49 | self.task_to_keys = { 50 | "cola": ("sentence", None), 51 | "mnli": ("premise", "hypothesis"), 52 | "mnli-mm": ("premise", "hypothesis"), 53 | "mrpc": ("sentence1", "sentence2"), 54 | "qnli": ("question", "sentence"), 55 | "qqp": ("question1", "question2"), 56 | "rte": ("sentence1", "sentence2"), 57 | "sst2": ("sentence", None), 58 | "stsb": ("sentence1", "sentence2"), 59 | "wnli": ("sentence1", "sentence2"), 60 | } 61 | self.task = "cola" 62 | self.actual_task = "mnli" if self.task == "mnli-mm" else self.task 63 | self.model_checkpoint = "/Users/hhwang/models/distilbert-base-uncased" 64 | 65 | logger.info(f"begin load model {self.model_checkpoint}") 66 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True) 67 | self.num_labels = 3 if self.task.startswith("mnli") else 1 if self.task == "stsb" else 2 68 | self.batch_size = 2 69 | 70 | dataset_path = "glue" 71 | logger.info(f"begin load dataset {dataset_path} -> {self.actual_task}") 72 | datasets = load_dataset(dataset_path, self.actual_task) 73 | logger.info(f"loaded datasets: {datasets}") 74 | item_count = 20 75 | logger.info(f"convert {item_count} records to ray dataset") 76 | self.ray_datasets = { 77 | "train": ray.data.from_huggingface(datasets["train"].select(range(item_count))), 78 | "validation": ray.data.from_huggingface(datasets["validation"].select(range(item_count))), 79 | "test": ray.data.from_huggingface(datasets["test"].select(range(item_count))), 80 | } 81 | self.train_count = self.ray_datasets["train"].count() 82 | self.validation_count = self.ray_datasets["validation"].count() 83 | self.test_count = self.ray_datasets["test"].count() 84 | logger.info(f"dataset train count: {self.train_count}") 85 | logger.info(f"dataset validation count: {self.validation_count}") 86 | logger.info(f"dataset test count: {self.test_count}") 87 | model_name = self.model_checkpoint.split("/")[-1] 88 | self.name = f"{model_name}-finetuned-{self.task}" 89 | logger.info(f"output model dir: {self.name}") 90 | 91 | # Tokenize input sentences 92 | def collate_fn(self, examples: Dict[str, np.array]): 93 | sentence1_key, sentence2_key = self.task_to_keys[self.task] 94 | if sentence2_key is None: 95 | outputs = self.tokenizer( 96 | list(examples[sentence1_key]), 97 | truncation=True, 98 | padding="longest", 99 | return_tensors="pt", 100 | ) 101 | else: 102 | outputs = self.tokenizer( 103 | list(examples[sentence1_key]), 104 | list(examples[sentence2_key]), 105 | truncation=True, 106 | padding="longest", 107 | return_tensors="pt", 108 | ) 109 | outputs["labels"] = torch.LongTensor(examples["label"]) 110 | 111 | if self.use_gpu: 112 | # Move all input tensors to GPU 113 | for key, value in outputs.items(): 114 | outputs[key] = value.cuda() 115 | 116 | return outputs 117 | 118 | def train_func(self, config): 119 | # Calculate the maximum steps per epoch based on the number of rows in the training dataset. 120 | # Make sure to scale by the total number of training workers and the per device batch size. 
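        # With the defaults above (20 training rows, batch_size=2, num_workers=2) this works out to 20 // (2 * 2) = 5 steps per epoch.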
121 | max_steps_per_epoch = self.ray_datasets["train"].count() // (self.batch_size * self.num_workers) 122 | logger.info(f"max_steps_per_epoch: {max_steps_per_epoch}, batch_size: {self.batch_size}, num_workers: {self.num_workers}") 123 | 124 | # metric = load_metric("glue", self.actual_task) 125 | tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True) 126 | model = AutoModelForSequenceClassification.from_pretrained( 127 | self.model_checkpoint, num_labels=self.num_labels 128 | ) 129 | 130 | train_ds = ray.train.get_dataset_shard("train") 131 | eval_ds = ray.train.get_dataset_shard("eval") 132 | 133 | train_ds_iterable = train_ds.iter_torch_batches( 134 | batch_size=self.batch_size, collate_fn=self.collate_fn 135 | ) 136 | eval_ds_iterable = eval_ds.iter_torch_batches( 137 | batch_size=self.batch_size, collate_fn=self.collate_fn 138 | ) 139 | 140 | args = TrainingArguments( 141 | self.name, 142 | evaluation_strategy="epoch", 143 | save_strategy="epoch", 144 | logging_strategy="epoch", 145 | per_device_train_batch_size=self.batch_size, 146 | per_device_eval_batch_size=self.batch_size, 147 | learning_rate=config.get("learning_rate", 2e-5), 148 | num_train_epochs=config.get("epochs", 2), 149 | weight_decay=config.get("weight_decay", 0.01), 150 | push_to_hub=False, 151 | max_steps=max_steps_per_epoch * config.get("epochs", 2), 152 | disable_tqdm=True, # declutter the output a little 153 | use_cpu=not self.use_gpu, # you need to explicitly set no_cuda if you want CPUs 154 | report_to="none", 155 | ) 156 | 157 | # def compute_metrics(eval_pred): 158 | # predictions, labels = eval_pred 159 | # if self.task != "stsb": 160 | # predictions = np.argmax(predictions, axis=1) 161 | # else: 162 | # predictions = predictions[:, 0] 163 | # return metric.compute(predictions=predictions, references=labels) 164 | 165 | trainer = Trainer( 166 | model, 167 | args, 168 | train_dataset=train_ds_iterable, 169 | eval_dataset=eval_ds_iterable, 170 | tokenizer=tokenizer, 171 | # compute_metrics=compute_metrics, 172 | ) 173 | 174 | trainer.add_callback(RayTrainReportCallback()) 175 | 176 | trainer = prepare_trainer(trainer) 177 | 178 | logger.info("Starting training") 179 | trainer.train() 180 | 181 | def train(self): 182 | # metric_name = ( 183 | # "pearson" 184 | # if self.task == "stsb" 185 | # else "matthews_correlation" 186 | # if self.task == "cola" 187 | # else "accuracy" 188 | # ) 189 | 190 | # validation_key = ( 191 | # "validation_mismatched" 192 | # if self.task == "mnli-mm" 193 | # else "validation_matched" 194 | # if self.task == "mnli" 195 | # else "validation" 196 | # ) 197 | logger.info(f"build ray TorchTrainer") 198 | 199 | trainer = TorchTrainer( 200 | self.train_func, 201 | scaling_config=ScalingConfig(num_workers=self.num_workers, use_gpu=self.use_gpu), 202 | datasets={ 203 | "train": self.ray_datasets["train"], 204 | "eval": self.ray_datasets["validation"], 205 | }, 206 | run_config=RunConfig( 207 | checkpoint_config=CheckpointConfig( 208 | num_to_keep=1, 209 | checkpoint_score_attribute="eval_loss", 210 | checkpoint_score_order="min", 211 | ), 212 | failure_config=FailureConfig( 213 | max_failures=5 214 | ) 215 | ), 216 | ) 217 | 218 | logger.info(f"begin ray train fit") 219 | result = trainer.fit() 220 | logger.info(f"end ray train fit") 221 | logger.info(f"result: {result}") 222 | 223 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/initializers/hf_transformers/deepspeed.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from typing import Any, Dict, Optional, Tuple 5 | 6 | import deepspeed 7 | import torch 8 | from huggingface_hub import snapshot_download 9 | from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel 10 | 11 | from llmadmin.backend.logger import get_logger 12 | 13 | from .base import TransformersInitializer 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | # TODO: Allow deepspeed kwargs 19 | class DeepSpeedInitializer(TransformersInitializer): 20 | """Initialize model (with DeepSpeed) and tokenizer and place them on the correct device. 21 | 22 | Args: 23 | device (torch.device): Device to place model and tokenizer on. 24 | world_size (int): Number of GPUs to use. 25 | dtype (torch.dtype, optional): Data type to use. Defaults to torch.float16. 26 | use_bettertransformer (bool, optional): Whether to use BetterTransformer. Defaults to False. 27 | torch_compile (Optional[Dict[str, Any]], optional): Parameters for ``torch.compile``. Defaults to None. 28 | max_tokens (int, optional): Maximum number of tokens to use. Defaults to 1024. 29 | use_kernel (bool, optional): Whether to use the DeepSpeed kernel injection. Defaults to False. 30 | use_meta_tensor (bool, optional): Whether to use meta tensor loading method. Defaults to False. 31 | injection_policy ([type], optional): Injection policy for DeepSpeed AutoTP. Cannot 32 | be set if use_kernel=True. Defaults to None. 33 | ds_inference_kwargs (Dict[str, Any], optional): Other keyword arguments for ``deepspeed.initialize``. 34 | Specific arguments in the signature of this function will override these values. 35 | **from_pretrained_kwargs: Keyword arguments for ``AutoModel.from_pretrained``. 
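    Example (illustrative only; the model id and settings are placeholders):
        initializer = DeepSpeedInitializer(
            device=torch.device("cuda"),
            world_size=1,
            dtype=torch.float16,
        )
        model = initializer.load_model("facebook/opt-1.3b")
        model = initializer.postprocess_model(model)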
36 | """ 37 | 38 | def __init__( 39 | self, 40 | device: torch.device, 41 | world_size: int, 42 | dtype: torch.dtype = torch.float16, 43 | use_bettertransformer: bool = False, 44 | torch_compile: Optional[Dict[str, Any]] = None, 45 | max_tokens: int = 1024, 46 | use_kernel: bool = False, 47 | use_meta_tensor: bool = False, 48 | injection_policy=None, 49 | ds_inference_kwargs: Optional[Dict[str, Any]] = None, 50 | **from_pretrained_kwargs, 51 | ): 52 | super().__init__( 53 | device=device, 54 | world_size=world_size, 55 | dtype=dtype, 56 | use_bettertransformer=use_bettertransformer, 57 | torch_compile=torch_compile, 58 | **from_pretrained_kwargs, 59 | ) 60 | self.max_tokens = max_tokens 61 | self.use_kernel = use_kernel 62 | self.use_meta_tensor = use_meta_tensor 63 | # TODO: Allow conversion from strings (need to do dynamic imports) 64 | self.injection_policy = injection_policy 65 | self.ds_inference_kwargs = ds_inference_kwargs 66 | 67 | if self.use_kernel: 68 | assert not (self.use_bettertransformer or self.torch_compile) 69 | 70 | if self.use_meta_tensor: 71 | assert self.use_kernel 72 | 73 | def _get_model_from_pretrained_kwargs(self): 74 | return dict( 75 | low_cpu_mem_usage=True, 76 | torch_dtype=self.dtype, 77 | **self.from_pretrained_kwargs, 78 | ) 79 | 80 | # From https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/utils.py 81 | def _generate_checkpoint_json( 82 | self, model_id: str, checkpoint_path: Optional[str] = None 83 | ) -> Tuple[str, str]: 84 | if checkpoint_path is None: 85 | repo_root = snapshot_download( 86 | model_id, 87 | allow_patterns=["*"], 88 | ignore_patterns=["*.safetensors", "*.h5", "*.msgpack"], 89 | local_files_only=False, 90 | revision=None, 91 | ) 92 | else: 93 | assert os.path.exists( 94 | checkpoint_path 95 | ), f"Checkpoint path {checkpoint_path} does not exist" 96 | repo_root = checkpoint_path 97 | 98 | if os.path.exists(os.path.join(repo_root, "ds_inference_config.json")): 99 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json") 100 | elif model_id in [ 101 | "microsoft/bloom-deepspeed-inference-int8", 102 | "microsoft/bloom-deepspeed-inference-fp16", 103 | ]: 104 | # tp presharded repos come with their own checkpoints config file 105 | checkpoints_json = os.path.join(repo_root, "ds_inference_config.json") 106 | else: 107 | checkpoints_json = os.path.join(repo_root, "checkpoints.json") 108 | 109 | with open(checkpoints_json, "w", encoding="utf-8") as f: 110 | file_list = [ 111 | str(entry).split("/")[-1] 112 | for entry in Path(repo_root).rglob("*.[bp][it][n]") 113 | if entry.is_file() 114 | ] 115 | data = {"type": "BLOOM", "checkpoints": file_list, "version": 1.0} 116 | json.dump(data, f) 117 | 118 | return os.path.abspath(repo_root), os.path.abspath(checkpoints_json) 119 | 120 | def load_model(self, model_id: str) -> "PreTrainedModel": 121 | model_id_or_path = self._get_model_location_on_disk(model_id) 122 | from_pretrained_kwargs = self._get_model_from_pretrained_kwargs() 123 | 124 | logger.info(f"Loading model {model_id_or_path}...") 125 | if self.use_meta_tensor: 126 | logger.info("Loading model using DeepSpeed meta tensor...") 127 | 128 | try: 129 | config = AutoConfig.from_pretrained( 130 | model_id_or_path, **from_pretrained_kwargs 131 | ) 132 | except OSError: 133 | if model_id_or_path != model_id: 134 | logger.warning( 135 | f"Couldn't load model from derived path {model_id_or_path}, " 136 | f"trying to load from model_id {model_id}" 137 | ) 138 | config = 
AutoConfig.from_pretrained( 139 | model_id, **from_pretrained_kwargs 140 | ) 141 | else: 142 | raise 143 | 144 | self._repo_root, self._checkpoints_json = self._generate_checkpoint_json( 145 | model_id 146 | ) 147 | 148 | with deepspeed.OnDevice(dtype=torch.float16, device="meta"): 149 | model = AutoModelForCausalLM.from_config(config) 150 | else: 151 | try: 152 | model = AutoModelForCausalLM.from_pretrained( 153 | model_id_or_path, **from_pretrained_kwargs 154 | ) 155 | except OSError: 156 | if model_id_or_path != model_id: 157 | logger.warning( 158 | f"Couldn't load model from derived path {model_id_or_path}, " 159 | f"trying to load from model_id {model_id}" 160 | ) 161 | model = AutoModelForCausalLM.from_pretrained( 162 | model_id, **from_pretrained_kwargs 163 | ) 164 | else: 165 | raise 166 | model.eval() 167 | return model 168 | 169 | def postprocess_model(self, model: "PreTrainedModel") -> "PreTrainedModel": 170 | from transformers import GPTNeoXForCausalLM, LlamaForCausalLM 171 | 172 | injection_policy = self.injection_policy 173 | # TODO: remove those later when deepspeed master is updated 174 | if injection_policy is None and not self.use_kernel: 175 | if isinstance(model, GPTNeoXForCausalLM): 176 | from transformers import GPTNeoXLayer 177 | 178 | injection_policy = { 179 | GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h") 180 | } 181 | elif isinstance(model, LlamaForCausalLM): 182 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 183 | 184 | injection_policy = { 185 | LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj") 186 | } 187 | 188 | if self.use_bettertransformer: 189 | from optimum.bettertransformer import BetterTransformer 190 | 191 | logger.info("Transforming the model with BetterTransformer...") 192 | model = BetterTransformer.transform(model) 193 | 194 | ds_kwargs = self.ds_inference_kwargs or {} 195 | ds_kwargs = ds_kwargs.copy() 196 | ds_kwargs.update( 197 | dict( 198 | dtype=self.dtype, 199 | mp_size=self.world_size, 200 | replace_with_kernel_inject=self.use_kernel, 201 | injection_policy=injection_policy, 202 | max_tokens=self.max_tokens, 203 | ) 204 | ) 205 | if self.use_meta_tensor: 206 | ds_kwargs.update( 207 | dict(base_dir=self._repo_root, checkpoint=self._checkpoints_json) 208 | ) 209 | 210 | logger.info(f"deepspeed.init_inference kwargs: {ds_kwargs}") 211 | model = deepspeed.init_inference( 212 | model, 213 | **ds_kwargs, 214 | ) 215 | 216 | if self.torch_compile and self.torch_compile["backend"]: 217 | logger.info("Compiling the model with torch.compile()...") 218 | model = torch.compile(model, **self.torch_compile) 219 | 220 | # Add attributes for compatibility with the pipeline 221 | model.use_kernel = self.use_kernel 222 | model.device = self.device 223 | model = model.to(self.device) 224 | return model 225 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import subprocess 4 | import time 5 | import traceback 6 | from collections import defaultdict 7 | from functools import wraps 8 | from typing import List, Optional 9 | 10 | from transformers import AutoConfig 11 | import torch.distributed as dist 12 | from filelock import FileLock 13 | from ray.air.util.torch_dist import ( 14 | ActorHandle, 15 | _get_node_and_gpu_ids, 16 | _init_torch_distributed, 17 | get_address_and_port, 18 | ) 19 | from torch.hub import _get_torch_home 20 | 21 | from 
llmadmin.backend.logger import get_logger 22 | from llmadmin.backend.server.models import S3MirrorConfig 23 | 24 | logger = get_logger(__name__) 25 | 26 | 27 | def download_model( 28 | model_id: str, 29 | endpoint_url: str, 30 | bucket_uri: str, 31 | s3_sync_args: Optional[List[str]] = None, 32 | ) -> None: 33 | """ 34 | Download a model from an S3 bucket and save it in TRANSFORMERS_CACHE for 35 | seamless interoperability with Hugging Face's Transformers library. 36 | 37 | The downloaded model must have a 'hash' file containing the commit hash corresponding 38 | to the commit on Hugging Face Hub. 39 | """ 40 | from transformers.utils.hub import TRANSFORMERS_CACHE 41 | 42 | isAutoLoadConfigSuccess = False 43 | modelConfig = None 44 | try: 45 | modelConfig = AutoConfig.from_pretrained( 46 | model_id, trust_remote_code=True) 47 | isAutoLoadConfigSuccess = True 48 | except Exception: 49 | isAutoLoadConfigSuccess = False 50 | 51 | if modelConfig and isAutoLoadConfigSuccess: 52 | logger.info(f"Model exist and success to load AutoConfig from_pretrained '{model_id}'") 53 | return 54 | else: 55 | logger.info(f"Fail to load AutoConfig from_pretrained '{model_id}'") 56 | 57 | s3_sync_args = s3_sync_args or [] 58 | logger.info(f"Downloading '{model_id}' from '{bucket_uri}' to '{TRANSFORMERS_CACHE}'") 59 | path = os.path.expanduser(os.path.join(TRANSFORMERS_CACHE, f"models--{model_id.replace('/', '--')}")) 60 | 61 | isS3 = bucket_uri.startswith('s3://') 62 | if isS3: 63 | model_hash_file = os.path.join(bucket_uri, "hash") 64 | if endpoint_url: 65 | logger.info(f"Downloading '{model_id}' hash from server '{endpoint_url}' '{model_hash_file}' ") 66 | subprocess.run(["aws", "--endpoint-url", endpoint_url, "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."]) 67 | else: 68 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ") 69 | subprocess.run(["aws", "s3", "cp", "--quiet"] + s3_sync_args + [model_hash_file, "."]) 70 | else: 71 | model_hash_file = bucket_uri + "hash" 72 | logger.info(f"Downloading '{model_id}' hash from '{model_hash_file}' ") 73 | subprocess.run(["cp -rf " + model_hash_file + " ."], shell=True) 74 | 75 | if not os.path.exists(os.path.join(".", "hash")): 76 | raise RuntimeError("Hash file not found in the bucket or bucket could not have been downloaded.") 77 | 78 | with open(os.path.join(".", "hash"), "r") as f: 79 | f_hash = f.read().strip() 80 | 81 | model_cache_path = os.path.join(path, "snapshots", f_hash) 82 | 83 | model_config_file = os.path.join(model_cache_path, "config.json") 84 | if os.path.exists(model_config_file): 85 | logger.info(f"Skip download model '{model_id}' due to config '{model_config_file}' exist") 86 | return 87 | 88 | subprocess.run(["mkdir", "-p", model_cache_path]) 89 | subprocess.run(["mkdir", "-p", os.path.join(path, "refs")]) 90 | 91 | logger.info(f"Downloading '{model_id}' files from '{bucket_uri}' to '{model_cache_path}'") 92 | if isS3: 93 | if endpoint_url: 94 | subprocess.run([ "aws", "--endpoint-url", endpoint_url, "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path]) 95 | else: 96 | subprocess.run([ "aws", "s3", "sync", "--quiet"] + s3_sync_args + [bucket_uri, model_cache_path]) 97 | else: 98 | subprocess.run(["cp -rf " + bucket_uri + "*" + " " + model_cache_path], shell=True) 99 | 100 | with open(os.path.join(path, "refs", "main"), "w") as f: 101 | f.write(f_hash) 102 | 103 | def timeit(func): 104 | """ 105 | Decorator to time a function. 
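    Example (illustrative):
        @timeit
        def generate():
            ...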
106 | """ 107 | 108 | @wraps(func) 109 | def inner(*args, **kwargs): 110 | start_time = time.monotonic() 111 | ret = func(*args, **kwargs) 112 | time_taken = time.monotonic() - start_time 113 | logger.info(f"LLM timing: {func} took {time_taken} s to complete") 114 | return ret 115 | 116 | return inner 117 | 118 | 119 | def initialize_node( 120 | model_id: Optional[str] = None, 121 | s3_mirror_config: Optional[S3MirrorConfig] = None, 122 | ): 123 | """ 124 | Perform initialization for a node. 125 | 126 | Currently, that means downloading the model from the S3 bucket. 127 | """ 128 | # Create the torch cache kernels directory if it doesn't exist. 129 | # This is a workaround for a torch issue, where the kernels directory 130 | # cannot be created by torch if the parent directory doesn't exist. 131 | torch_cache_home = _get_torch_home() 132 | os.makedirs(os.path.join(torch_cache_home, "kernels"), exist_ok=True) 133 | 134 | if model_id and s3_mirror_config and s3_mirror_config.bucket_uri: 135 | lock_path = os.path.expanduser(f"~/{model_id.replace('/', '--')}.lock") 136 | try: 137 | # Timeout 0 means there will be only one attempt to acquire 138 | # the file lock. If it cannot be acquired, a TimeoutError 139 | # will be thrown. 140 | # This allows us to make sure that subsequent processes don't 141 | # duplicate work. 142 | with FileLock(lock_path, timeout=0): 143 | endpoint_url = s3_mirror_config.endpoint_url 144 | bucket_uri = s3_mirror_config.bucket_uri 145 | s3_sync_args = s3_mirror_config.s3_sync_args 146 | try: 147 | download_model(model_id, endpoint_url, bucket_uri, s3_sync_args=s3_sync_args) 148 | logger.info("Done downloading the model from bucket!") 149 | except RuntimeError: 150 | logger.warning( 151 | f"Unable to download the model from bucket. Traceback:\n {traceback.format_exc()}" 152 | ) 153 | except TimeoutError: 154 | # If the lock is already held, wait for the holder to finish, but do nothing. 155 | with FileLock(lock_path, timeout=-1): 156 | pass 157 | 158 | 159 | def merge_dicts(overwrite: dict, base: dict) -> dict: 160 | """ 161 | Merge two dictionaries recursively, with keys from overwrite taking precedence. 162 | """ 163 | base = base.copy() 164 | for key, value in overwrite.items(): 165 | if isinstance(value, dict): 166 | # Merge into the existing node (or a new one) and keep the returned copy. 167 | node = base.setdefault(key, {}) 168 | base[key] = merge_dicts(value, node) 169 | else: 170 | base[key] = value 171 | 172 | return base 173 | 174 | 175 | async def init_torch_dist_process_group_async( 176 | workers: List[ActorHandle], 177 | backend: str = "gloo", 178 | init_method: str = "env", 179 | ) -> List[int]: 180 | """Initialize a torch distributed process group asynchronously. 181 | 182 | This is identical to 183 | ``ray.air.util.torch_dist.init_torch_dist_process_group`` 184 | but uses asyncio to avoid blocking the event loop. 185 | 186 | Note: this util assumes that the workers are passed in 187 | the order of their global ranks. 188 | 189 | Args: 190 | workers: A list of TorchDistributedWorker actors. 191 | backend: The torch distributed backend to use, 192 | possible choices are "gloo" or "nccl". 193 | init_method: The initialization method to use, 194 | possible choices are "env" or "tcp". 195 | 196 | Returns: 197 | Local ranks on their respective nodes for the list of workers. 198 | """ 199 | if not dist.is_available(): 200 | raise RuntimeError("Distributed torch is not available.") 201 | 202 | # Build a map from node_id to workers on that node.
203 | node_and_gpu_ids = await asyncio.gather( 204 | *[w.execute.remote(_get_node_and_gpu_ids) for w in workers] 205 | ) 206 | # All the workers on a specific node. 207 | node_to_workers = defaultdict(list) 208 | # All the gpu ids visible to all the workers on a specific node. 209 | node_to_gpu_ids = defaultdict(set) 210 | for i, (node_id, gpu_ids) in enumerate(node_and_gpu_ids): 211 | node_to_workers[node_id].append(i) 212 | # Force list. 213 | if not isinstance(gpu_ids, list): 214 | gpu_ids = [gpu_ids] 215 | # It is possible for a worker to have access to multiple GPUs. 216 | for gpu_id in gpu_ids: 217 | node_to_gpu_ids[node_id].add(gpu_id) 218 | 219 | # Assume the first worker is the master. 220 | master_addr, master_port = ( 221 | await asyncio.gather(workers[0].execute.remote(get_address_and_port)) 222 | )[0] 223 | 224 | setup_futures = [] 225 | world_size = len(workers) 226 | local_ranks = [] 227 | for rank, worker in enumerate(workers): 228 | node_id = node_and_gpu_ids[rank][0] 229 | local_rank = node_to_workers[node_id].index(rank) 230 | local_world_size = len(node_to_workers[node_id]) 231 | setup_futures.append( 232 | worker.execute.remote( 233 | _init_torch_distributed, 234 | init_method=init_method, 235 | backend=backend, 236 | rank=rank, 237 | world_size=world_size, 238 | local_rank=local_rank, 239 | local_world_size=local_world_size, 240 | master_addr=master_addr, 241 | master_port=master_port, 242 | # sorted() gives a deterministic ordering, so 243 | # CUDA_VISIBLE_DEVICES is always sorted. 244 | gpu_ids=sorted(node_to_gpu_ids[node_id]), 245 | ) 246 | ) 247 | local_ranks.append(local_rank) 248 | 249 | # Wait for all workers to join the process group. 250 | await asyncio.gather(*setup_futures) 251 | 252 | return local_ranks 253 | -------------------------------------------------------------------------------- /llmadmin/api/cli.py: -------------------------------------------------------------------------------- 1 | # import ast 2 | # import json 3 | from typing import Annotated, Optional 4 | 5 | import typer 6 | # from rich import print as rp 7 | # from rich.console import Console 8 | # from rich.progress import Progress, SpinnerColumn, TextColumn 9 | # from rich.table import Table 10 | from ray.serve._private.constants import DEFAULT_HTTP_PORT 11 | from llmadmin.api import sdk 12 | 13 | app = typer.Typer() 14 | 15 | model_type = typer.Option( 16 | default=..., help="The model to use. You can specify multiple models." 17 | ) 18 | 19 | ft_define = typer.Option( 20 | default=..., help="The fine-tune YAML definition file." 21 | ) 22 | 23 | app_name = typer.Option( 24 | default=..., help="The name of the Ray Serve application." 25 | ) 26 | host = typer.Option( 27 | default=..., help="The host IP address of the Ray API server." 28 | ) 29 | port = typer.Option( 30 | default=..., help="The port of the API server." 31 | ) 32 | prompt_type = typer.Option(help="Prompt to query") 33 | stats_type = typer.Option(help="Whether to print generated statistics") 34 | prompt_file_type = typer.Option( 35 | default=..., help="File containing prompts (a simple text file)." 36 | ) 37 | separator_type = typer.Option(help="Separator used in prompt files") 38 | results_type = typer.Option(help="Where to save the results") 39 | file_type = typer.Option(default=..., help="The flow graph JSON file.") 40 | port_type = typer.Option(default=..., help="The port of the service.") 41 | apiserver_scale_type = typer.Option(default=..., help="A dict-style string for scaling the service,
for example: --scale-config=min_replicas=1,max_replicas=5") 42 | apiserver_resource_type = typer.Option(default=..., help="A string of dict for resource requirement. for example: --resource-config=num_cpus=1") 43 | 44 | @app.command() 45 | def start_apiserver( 46 | port: Annotated[Optional[int], port_type] = DEFAULT_HTTP_PORT, 47 | resource_config: Annotated[str, apiserver_resource_type] = None, 48 | scale_config: Annotated[str, apiserver_scale_type] = None 49 | ): 50 | """Start a api server, it will provide apis. 51 | Args: 52 | *host: The host ip to run. 53 | *port: The port to run. 54 | """ 55 | sdk.start_apiserver(port=port, resource_config=resource_config, scale_config=scale_config) 56 | 57 | @app.command() 58 | def run_ft(ft: Annotated[str, ft_define]): 59 | """Start a fine tune process. 60 | 61 | Args: 62 | *model: The model to run. 63 | """ 64 | sdk.run_ft(ft) 65 | 66 | @app.command() 67 | def ray_ft(model: Annotated[str, ft_define]): 68 | """Start a fine tune ray process. 69 | 70 | Args: 71 | *model: The model to run. 72 | """ 73 | sdk.run_ray_ft(model) 74 | 75 | # @app.command() 76 | # def list_models(metadata: Annotated[bool, "Whether to print metadata"] = False): 77 | # """Get a list of the available models""" 78 | # result = sdk.models() 79 | # if metadata: 80 | # for model in result: 81 | # rp(f"[bold]{model}:[/]") 82 | # rp(sdk.metadata(model)) 83 | # else: 84 | # print("\n".join(result)) 85 | 86 | 87 | # def _print_result(result, model, print_stats): 88 | # rp(f"[bold]{model}:[/]") 89 | # if print_stats: 90 | # rp("[bold]Stats:[/]") 91 | # rp(result) 92 | # else: 93 | # rp(result) 94 | 95 | 96 | # def progress_spinner(): 97 | # return Progress( 98 | # SpinnerColumn(), 99 | # TextColumn("[progress.description]{task.description}"), 100 | # transient=True, 101 | # ) 102 | 103 | 104 | # @app.command() 105 | # def query( 106 | # model: Annotated[List[str], model_type], 107 | # prompt: Annotated[Optional[List[str]], prompt_type] = None, 108 | # prompt_file: Annotated[Optional[str], prompt_file_type] = None, 109 | # separator: Annotated[str, separator_type] = "----", 110 | # output_file: Annotated[str, results_type] = "llmadmin-output.json", 111 | # print_stats: Annotated[bool, stats_type] = False, 112 | # ): 113 | # """Query one or several models with one or multiple prompts, 114 | # optionally read from file, and save the results to a file.""" 115 | # with progress_spinner() as progress: 116 | # if prompt_file: 117 | # with open(prompt_file, "r") as f: 118 | # prompt = f.read().split(separator) 119 | 120 | # results = {p: [] for p in prompt} 121 | 122 | # for m in model: 123 | # progress.add_task( 124 | # description=f"Processing all prompts against model: {m}.", 125 | # total=None, 126 | # ) 127 | # query_results = sdk.batch_query(m, prompt) 128 | # for result in query_results: 129 | # _print_result(result, m, print_stats) 130 | 131 | # for i, p in enumerate(prompt): 132 | # result = query_results[i] 133 | # text = result 134 | # # del result["generated_text"] 135 | # results[p].append({"model": m, "result": text, "stats": result}) 136 | 137 | # progress.add_task(description="Writing output file.", total=None) 138 | # with open(output_file, "w") as f: 139 | # f.write(json.dumps(results, indent=2)) 140 | 141 | 142 | # @app.command(deprecated=True, name="batch_query") 143 | # def batch_query( 144 | # model: Annotated[List[str], model_type], 145 | # prompt: Annotated[List[str], prompt_type], 146 | # print_stats: Annotated[bool, stats_type] = False, 147 | # ): 148 | # """Query a 
model with a batch of prompts.""" 149 | # with progress_spinner() as progress: 150 | # for m in model: 151 | # progress.add_task( 152 | # description=f"Processing prompt against {m}...", total=None 153 | # ) 154 | # results = sdk.batch_query(m, prompt) 155 | # for result in results: 156 | # _print_result(result, m, print_stats) 157 | 158 | 159 | # @app.command(deprecated=True, name="multi_query") 160 | # def multi_query( 161 | # model: Annotated[List[str], model_type], 162 | # prompt_file: Annotated[str, prompt_file_type], 163 | # separator: Annotated[str, separator_type] = "----", 164 | # output_file: Annotated[str, results_type] = "llmadmin-output.json", 165 | # ): 166 | # """Query one or multiple models with a batch of prompts taken from a file.""" 167 | 168 | # with progress_spinner() as progress: 169 | # progress.add_task( 170 | # description=f"Loading your prompts from {prompt_file}.", total=None 171 | # ) 172 | # with open(prompt_file, "r") as f: 173 | # prompts = f.read().split(separator) 174 | # results = {prompt: [] for prompt in prompts} 175 | 176 | # for m in model: 177 | # progress.add_task( 178 | # description=f"Processing all prompts against model: {model}.", 179 | # total=None, 180 | # ) 181 | # query_results = sdk.batch_query(m, prompts) 182 | # for i, prompt in enumerate(prompts): 183 | # result = query_results[i] 184 | # text = result["generated_text"] 185 | # del result["generated_text"] 186 | # results[prompt].append({"model": m, "result": text, "stats": result}) 187 | 188 | # progress.add_task(description="Writing output file.", total=None) 189 | # with open(output_file, "w") as f: 190 | # f.write(json.dumps(results, indent=2)) 191 | 192 | 193 | # evaluator_type = typer.Option(help="Which LLM to use for evaluation") 194 | 195 | 196 | # @app.command() 197 | # def run(model: Annotated[List[str], model_type]): 198 | # """Start a model. 199 | 200 | # Args: 201 | # *model: The model to run. 202 | # """ 203 | # sdk.run(*model) 204 | 205 | # @app.command() 206 | # def run_experimental(model: Annotated[List[str], model_type]): 207 | # """Start a model for experimental, it will do inference by transformer pipeline. 208 | 209 | # Args: 210 | # *model: The model to run. 211 | # """ 212 | # sdk.run_experimental(*model) 213 | 214 | # @app.command() 215 | # def del_serve(appname: Annotated[str, app_name]): 216 | # """Remove a ray serve. 217 | 218 | # Args: 219 | # *model: The model to run. 220 | # """ 221 | # sdk.del_experimental(appname) 222 | 223 | # @app.command() 224 | # def run_application(file: Annotated[str, file_type]): 225 | # """Start a model in LLMAdmin for experimental. 226 | 227 | # Args: 228 | # *model: The model to run. 229 | # """ 230 | # from pathlib import Path 231 | # # If input is a file path, load JSON from the file 232 | # if isinstance(file, (str, Path)): 233 | # with open(file, "r", encoding="utf-8") as f: 234 | # flow_graph = json.load(f) 235 | # else: 236 | # raise TypeError( 237 | # "Input must be a file path (str)" 238 | # ) 239 | # sdk.run_application(flow_graph) 240 | 241 | 242 | 243 | # @app.command() 244 | # def run_comparation(): 245 | # """Start frontend for model comparation. 246 | 247 | # Args: 248 | # *model: The model to run. 
249 | # """ 250 | # sdk.run_comparation() 251 | 252 | # @app.command() 253 | # def evaluate( 254 | # input_file: Annotated[str, results_type] = "llmadmin-output.json", 255 | # evaluation_file: Annotated[str, results_type] = "evaluation-output.json", 256 | # evaluator: Annotated[str, evaluator_type] = "gpt-4", 257 | # ): 258 | # """Evaluate and summarize the results of a multi_query run with a strong 259 | # 'evaluator' LLM like GPT-4. 260 | # The results of the ranking are stored to file and displayed in a table. 261 | # """ 262 | # with progress_spinner() as progress: 263 | # progress.add_task(description="Loading the evaluator LLM.", total=None) 264 | # if evaluator == "gpt-4": 265 | # from llmadmin.common.evaluation import GPT 266 | 267 | # eval_model = GPT() 268 | # else: 269 | # raise NotImplementedError(f"No evaluator for {evaluator}") 270 | 271 | # with open(input_file, "r") as f: 272 | # results = json.load(f) 273 | 274 | # for prompt, result_list in results.items(): 275 | # progress.add_task( 276 | # description=f"Evaluating results for prompt: {prompt}.", total=None 277 | # ) 278 | # evaluation = eval_model.evaluate_results(prompt, result_list) 279 | # try: 280 | # # GPT-4 returns a string with a Python dictionary, hopefully! 281 | # evaluation = ast.literal_eval(evaluation) 282 | # except Exception: 283 | # print(f"Could not parse evaluation: {evaluation}") 284 | 285 | # for i, _res in enumerate(results[prompt]): 286 | # results[prompt][i]["rank"] = evaluation[i]["rank"] 287 | 288 | # progress.add_task(description="Storing evaluations.", total=None) 289 | # with open(evaluation_file, "w") as f: 290 | # f.write(json.dumps(results, indent=2)) 291 | 292 | # for prompt in results.keys(): 293 | # table = Table(title="Evaluation results (higher ranks are better)") 294 | 295 | # table.add_column("Model", justify="left", style="cyan", no_wrap=True) 296 | # table.add_column("Rank", style="magenta") 297 | # table.add_column("Response", justify="right", style="green") 298 | 299 | # for i, _res in enumerate(results[prompt]): 300 | # model = results[prompt][i]["model"] 301 | # response = results[prompt][i]["result"] 302 | # rank = results[prompt][i]["rank"] 303 | # table.add_row(model, str(rank), response) 304 | 305 | # console = Console() 306 | # console.print(table) 307 | 308 | 309 | if __name__ == "__main__": 310 | app() 311 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Anyscale 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /llmadmin/backend/llm/ft/transformer.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseFT 2 | from abc import ABC, abstractmethod 3 | from llmadmin.backend.logger import get_logger 4 | # from datasets import DatasetDict, Dataset, IterableDatasetDict, IterableDataset 5 | # from typing import Union 6 | from llmadmin.backend.server.models import FTApp 7 | from datasets import load_dataset 8 | from datasets import load_metric 9 | import pandas as pd 10 | # from ray.data.preprocessors import BatchMapper 11 | # import ray 12 | import torch 13 | from transformers import TrainingArguments, Trainer 14 | # import numpy as np 15 | # from ray.train.huggingface import TransformersTrainer 16 | # from ray.air.config import RunConfig, CheckpointConfig 17 | from .utils import parse_task_name 18 | from .tasks import TASK_REGISTRY 19 | from .tasks._base import Task 20 | from .methods.base import get_train_model 21 | # from ray.train.huggingface import TransformersCheckpoint 22 | from .const import CHECKPOINT_PATH 23 | from .callback import CustomCallback 24 | 25 | from llmadmin.backend.llm.utils import initialize_node 26 | 27 | logger = get_logger(__name__) 28 | 29 | class TransformersFT(BaseFT): 30 | def __init__(self, ftApp: FTApp): 31 | super().__init__(ftapp=ftApp) 32 | 33 | def train(self): 34 | self.trainV2() 35 | 36 | # Transformer train only 37 | def trainV2(self): 38 | taskobj: Task = None 39 | task = parse_task_name(self.ftapp) 40 | logger.info(f"TransformersFT.trainV2 finetune task name: '{task}'") 41 | taskcls = TASK_REGISTRY[task] 42 | 43 | if not taskcls: 44 | logger.error(f"Couldn't load defined task from the task registry: '{task}'") 45 | raise ValueError(f"Unsupported finetune task: '{task}'") 46 | 47 | logger.info("Initializing the node for finetuning") 48 | initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config) 49 | logger.info(f"Loading tokenizer for finetuning '{self.model_config.model_id}'") 50 | # self.model_config.model_id = '/root/.cache/huggingface/hub/ZhipuAI/chatglm3-6b/' 51 | # self.model_config.model_id = '/data/hhwang/models/chatglm2-6b/' 52 | tokenizer = self.initializer.load_tokenizer(self.model_config.model_id) 53 | if self.model_config.add_special_tokens: 54 | add_special_tokens = self.model_config.add_special_tokens 55 | if add_special_tokens.get("pad_token"): 56 | tokenizer.pad_token = add_special_tokens.get("pad_token") 57 | if add_special_tokens.get("eos_token"): 58 | tokenizer.eos_token = add_special_tokens.get("eos_token") 59 | logger.info(f"Initializing {taskcls} and loading dataset") 60 | # logger.info(f"Initialize {taskcls} and load dataset for model {self.model_config.model_id}") 61 | taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config) 62 | logger.info(f"Loading model {self.model_config.model_id} with {taskobj.AUTO_MODEL_CLASS}") 63 | from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {} 64 | model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs) 65 | if self.model_config.quantization_bit is not None: 66 | logger.info(f"Quantizing model to {self.model_config.quantization_bit} bit") 67 | model = model.quantize(self.model_config.quantization_bit) 68 | 69 | taskobj.set_model(model) 70 | 71 | # preprocess_function = taskobj.get_data_proprocess() 72 | # compute_metrics_function = taskobj.get_compute_metrics() 73 | data_collator =
taskobj.get_data_collator() 74 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas") 75 | 76 | data_config = self.ftapp.ft_config.data_config 77 | use_gpu = True if torch.cuda.is_available() else False 78 | use_mps = True if torch.backends.mps.is_available() else False 79 | logger.info(f"use_gpu: {use_gpu}, use_cpu: {not use_gpu}, use_mps: {use_mps}") 80 | 81 | logger.info(f"Finetune get train and validation dataset") 82 | if data_config.num_row > 0: 83 | # only for test purpose 84 | train_dataset = taskobj.getSmallTrainDataSet(data_config.num_row) 85 | eval_dataset = taskobj.getSmallEvalDataSet(data_config.num_row) 86 | else: 87 | # For train 88 | train_dataset = taskobj.getTrainDataSet() 89 | eval_dataset = taskobj.getEvalDataSet() 90 | 91 | logger.info(f"Finetune train dataset {train_dataset}") 92 | logger.info(f"Finetune eval dataset {eval_dataset}") 93 | 94 | if hasattr(model, "is_parallelizable"): 95 | logger.info(f"model.is_parallelizable = {model.is_parallelizable}") 96 | 97 | if hasattr(model, "model_parallel"): 98 | logger.info(f"model.model_parallel = {model.model_parallel}") 99 | 100 | if getattr(model, "hf_device_map", None) is not None: 101 | logger.info(f"model.hf_device_map is {model.hf_device_map}") 102 | 103 | ftConfig = self.ftapp.ft_config.train_config.base_config 104 | model_name = self.model_config.model_id.split("/")[-1] 105 | task_name = self.ft_task 106 | outputDir = f"{ftConfig.checkpoints_output_dir}/{model_name}-finetuned-{task_name}-{data_config.data_path}-{data_config.subset}" 107 | logger.info(f"Finetune checkpoints output dir: {outputDir}") 108 | args = TrainingArguments( 109 | outputDir, 110 | evaluation_strategy=ftConfig.evaluation_strategy, 111 | save_strategy=ftConfig.save_strategy, 112 | logging_strategy=ftConfig.logging_strategy, 113 | logging_steps = 2, 114 | save_steps = ftConfig.save_steps, 115 | eval_steps = 2, 116 | learning_rate=ftConfig.learning_rate, 117 | per_device_train_batch_size=ftConfig.per_device_train_batch_size, 118 | per_device_eval_batch_size=ftConfig.per_device_eval_batch_size, 119 | num_train_epochs=ftConfig.num_train_epochs, 120 | weight_decay=ftConfig.weight_decay, 121 | push_to_hub=False, 122 | disable_tqdm=False, # declutter the output a little 123 | use_cpu=not use_gpu, # you need to explicitly set no_cuda if you want CPUs 124 | remove_unused_columns=ftConfig.remove_unused_columns, 125 | ) 126 | trainConfig = self.ftapp.ft_config.train_config 127 | ftMethod = self.ftapp.ft_config.ft_method 128 | model = get_train_model(model, ftMethod, trainConfig) 129 | trainer = Trainer( 130 | # trainer = Seq2SeqTrainer( 131 | model, 132 | args, 133 | train_dataset=train_dataset, 134 | eval_dataset=eval_dataset, 135 | tokenizer=tokenizer, 136 | # compute_metrics=compute_metrics_function, 137 | data_collator=data_collator, 138 | ) 139 | trainer.add_callback(CustomCallback(trainer)) 140 | logger.info("Starting training") 141 | trainResult = trainer.train() 142 | logger.info(f"Train result {trainResult}") 143 | trainer.save_model() 144 | logger.info(f"Save model to {trainer.args.output_dir}") 145 | logger.info("Done training") 146 | 147 | # depend on ray for distribution 148 | # def trainV1(self): 149 | # taskobj: Task = None 150 | # task = parse_task_name(self.ftapp) 151 | # logger.info(f"TransformersFT.trainV1 finetune task name {task}") 152 | # taskcls = TASK_REGISTRY[task] 153 | 154 | # if not taskcls: 155 | # logger.error(f"Couldn't load defined task from register: {task}") 156 | # raise 157 | 158 | # 
logger.info("Starting initialize Finetune node tasks") 159 | # initialize_node(self.model_config.model_id, self.model_config.initialization.s3_mirror_config) 160 | 161 | # tokenizer = self.initializer.load_tokenizer(self.model_config.model_id) 162 | # logger.info("Done load tokenizer for finetune") 163 | 164 | # taskobj = taskcls.from_tokenizer(tokenizer, self.ftapp.ft_config) 165 | 166 | # from_pretrained_kwargs = taskobj.FROM_PRETRAINED_KWARGS if taskobj.FROM_PRETRAINED_KWARGS else {} 167 | # model = self.initializer.load_model(self.model_config.model_id, taskobj.AUTO_MODEL_CLASS, **from_pretrained_kwargs) 168 | # taskobj.set_model(model) 169 | 170 | # preprocess_function = taskobj.get_data_proprocess() 171 | # compute_metrics_function = taskobj.get_compute_metrics() 172 | # data_collator = taskobj.get_data_collator() 173 | # batch_encoder = BatchMapper(preprocess_function, batch_format="pandas") 174 | 175 | # ray_datasets = ray.data.from_huggingface(taskobj.get_dataset()) 176 | # model_name = self.model_config.model_id.split("/")[-1] 177 | # task = self.ft_task 178 | # name = f"{model_name}-finetuned-{task}" 179 | # use_gpu = True if torch.cuda.is_available() else False 180 | 181 | # def trainer_init_per_worker(train_dataset, eval_dataset = None, **config): 182 | # print(f"Is CUDA available: {torch.cuda.is_available()}") 183 | 184 | # args = TrainingArguments( 185 | # name, 186 | # evaluation_strategy=config.get("evaluation_strategy", "epoch"), 187 | # save_strategy=config.get("save_strategy", "epoch"), 188 | # logging_strategy=config.get("logging_strategy", "epoch"), 189 | # logging_steps = 2, 190 | # save_steps = 500, 191 | # eval_steps = 2, 192 | # learning_rate=config.get("learning_rate", 2e-5), 193 | # per_device_train_batch_size=config.get("per_device_train_batch_size", 16), 194 | # per_device_eval_batch_size=config.get("per_device_train_batch_size", 16), 195 | # num_train_epochs=config.get("epochs", 2), 196 | # weight_decay=config.get("weight_decay", 0.01), 197 | # push_to_hub=False, 198 | # disable_tqdm=False, # declutter the output a little 199 | # no_cuda=not use_gpu, # you need to explicitly set no_cuda if you want CPUs 200 | # remove_unused_columns=config.get("remove_unused_columns", True), 201 | # fp16=True, 202 | # ) 203 | 204 | # trainer = Trainer( 205 | # model, 206 | # args, 207 | # train_dataset=train_dataset, 208 | # eval_dataset=eval_dataset, 209 | # tokenizer=tokenizer, 210 | # compute_metrics=compute_metrics_function, 211 | # data_collator=data_collator, 212 | # ) 213 | # trainer.add_callback(CustomCallback(trainer)) 214 | # print("Starting training") 215 | 216 | # return trainer 217 | 218 | # trainer = TransformersTrainer( 219 | # trainer_init_per_worker=trainer_init_per_worker, 220 | # trainer_init_config = self.train_conf.get_train_kwargs(), 221 | # scaling_config=self.scale_config.as_air_scaling_config(), 222 | # datasets={ 223 | # "train": ray_datasets[taskobj.training_key()], 224 | # "evaluation": ray_datasets[taskobj.validation_key()], 225 | # }, 226 | # run_config=RunConfig( 227 | # # callbacks=[MLflowLoggerCallback(experiment_name=name)], 228 | # checkpoint_config=CheckpointConfig( 229 | # num_to_keep=1, 230 | # checkpoint_score_attribute="eval_loss", 231 | # checkpoint_score_order="min", 232 | # ), 233 | # ), 234 | # preprocessor=batch_encoder, 235 | # ) 236 | 237 | # result = trainer.fit() 238 | # print(result) 239 | # checkpoint = TransformersCheckpoint.from_checkpoint(result.checkpoint) 240 | # hf_trainer = 
checkpoint.get_model(model=taskobj.AUTO_MODEL_CLASS) 241 | # hf_trainer.save_pretrained(CHECKPOINT_PATH) 242 | # tokenizer.save_pretrained(CHECKPOINT_PATH) 243 | 244 | # print("Done") 245 | 246 | 247 | 248 | 249 | --------------------------------------------------------------------------------
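
A minimal usage sketch for two of the helpers defined in llmadmin/backend/llm/utils.py above: merge_dicts, which combines a base configuration with a set of overrides (keys from the overrides take precedence, including nested ones), and timeit, which logs the runtime of the wrapped call. The dictionaries and the tokenize_batch function below are made-up illustrations, not part of the repository.

from llmadmin.backend.llm.utils import merge_dicts, timeit

# merge_dicts(overwrite, base): recursive merge in which keys from `overwrite` win.
base = {"train": {"learning_rate": 2e-5, "num_train_epochs": 2}, "push_to_hub": False}
overrides = {"train": {"num_train_epochs": 5}}
merged = merge_dicts(overrides, base)
# Nested keys from `overrides` win: num_train_epochs becomes 5, learning_rate is kept,
# and neither input dictionary is modified.
print(merged)

# timeit: decorator that logs how long the wrapped call took via the backend logger.
@timeit
def tokenize_batch(texts):
    return [t.split() for t in texts]

tokenize_batch(["hello world"] * 1000)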
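
initialize_node() in the same module coordinates multiple workers on one machine through a file lock: the first process to acquire the lock performs the download, and every later process blocks until the lock is released, then returns without repeating the work. The standalone sketch below illustrates only that locking pattern; fetch_weights is a hypothetical stand-in for download_model(), and the /tmp lock path is chosen purely for illustration.

from filelock import FileLock, Timeout

def fetch_weights(model_id: str) -> None:
    # Hypothetical placeholder for the real download_model() call.
    print(f"downloading {model_id} ...")

def ensure_downloaded_once(model_id: str) -> None:
    lock_path = f"/tmp/{model_id.replace('/', '--')}.lock"
    try:
        # timeout=0 means a single acquisition attempt; filelock raises
        # Timeout if another process already holds the lock.
        with FileLock(lock_path, timeout=0):
            fetch_weights(model_id)
    except Timeout:
        # Another process is downloading; block until it releases the lock,
        # then return without duplicating the work.
        with FileLock(lock_path, timeout=-1):
            pass

ensure_downloaded_once("bert-base-uncased")

Catching filelock.Timeout directly is interchangeable with the TimeoutError handling in initialize_node(), since recent filelock releases derive Timeout from TimeoutError.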