├── docker-compose.yml
├── dockerfile
├── dockerfile-china
└── readme.md

/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.9"

services:
  # Controller: coordinates the distributed scheduling of the workers
  fastchat-controller:
    image: fastchat:latest
    ports:
      - "21001:21001"
      - "7861:7861"
    entrypoint: ["python3.9", "-m", "fastchat.serve.controller", "--host", "0.0.0.0", "--port", "21001"]
    depends_on:
      - fastchat-worker-qwen
      - fastchat-worker-baichuan

  # WebUI
  fastchat-webui:
    image: fastchat:latest
    ports:
      - "7860:7860"
    depends_on:
      fastchat-controller:
        condition: service_started
    entrypoint: ["python3.9", "-m", "fastchat.serve.gradio_web_server_multi", "--model-list-mode", "reload", "--controller-url", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "7860"]

  # OpenAI-compatible API
  fastchat-api-server:
    image: fastchat:latest
    ports:
      - "8000:8000"
    depends_on:
      - fastchat-controller
    entrypoint: ["python3.9", "-m", "fastchat.serve.openai_api_server", "--controller-address", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "8000"]

  # Qwen worker
  fastchat-worker-qwen:
    volumes:
      - /files/huggingface:/files/huggingface
    image: fastchat:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['1']
              capabilities: [gpu]
    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names=Qwen-7B-Chat", "--model-path=/files/huggingface/Qwen-7B-Chat", "--worker-address=http://fastchat-worker-qwen:21002", "--controller-address=http://fastchat-controller:21001", "--host=0.0.0.0", "--port=21002", "--device=cuda", "--gpus=0", "--num-gpus=1", "--max-gpu-memory=22GB"]

  # Baichuan worker
  fastchat-worker-baichuan:
    volumes:
      - /files/huggingface:/files/huggingface
    image: fastchat:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['2', '3']
              capabilities: [gpu]
    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names=Baichuan-13B-Chat", "--model-path=/files/huggingface/Baichuan-13B-Chat", "--worker-address=http://fastchat-worker-baichuan:21003", "--controller-address=http://fastchat-controller:21001", "--host=0.0.0.0", "--port=21003", "--device=cuda", "--num-gpus=2", "--max-gpu-memory=24GB"]
--------------------------------------------------------------------------------
/dockerfile:
--------------------------------------------------------------------------------
FROM nvidia/cuda:11.7.1-runtime-ubuntu20.04

RUN apt-get update -y && apt-get install -y python3.9 python3.9-distutils curl
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
RUN python3.9 get-pip.py
RUN pip3 install plotly einops transformers_stream_generator

WORKDIR /data/fschat
ADD . .
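
# Install FastChat from the local checkout, together with the extra
# dependency groups needed by the model worker and the Gradio web UI.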
RUN pip3 install -e ".[model_worker,webui]"
--------------------------------------------------------------------------------
/dockerfile-china:
--------------------------------------------------------------------------------
FROM nvidia/cuda:11.7.1-runtime-ubuntu20.04

RUN apt-get update -y && apt-get install -y python3.9 python3.9-distutils curl
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
RUN python3.9 get-pip.py -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip3 install plotly einops transformers_stream_generator -i https://pypi.tuna.tsinghua.edu.cn/simple

WORKDIR /data/fschat
ADD . .

# Same install step as the main dockerfile, but pulling packages from the Tsinghua PyPI mirror.
RUN pip3 install -e ".[model_worker,webui]" -i https://pypi.tuna.tsinghua.edu.cn/simple
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Running `Baichuan-13B-Chat` and `Qwen-7B-Chat` with `FastChat`

To compare `Baichuan-13B-Chat` against `Qwen-7B-Chat`, this setup uses FastChat's `Chatbot Arena`.

The official `dockerfile` only covers the `controller` and `worker`, so I put together my own set that covers the `controller`, `worker`, `api`, and `webui`.

`vllm` support has not been confirmed yet; work in progress.

## step 1, download the official source

FastChat source (clone with git or download manually):

> https://github.com/lm-sys/FastChat

Companion write-up for this repo:

> https://www.4wei.cn/archives/1003130

Although a quick install is possible with `pip3 install fschat`, building the image from source is needed to get the latest version and to run it inside Docker.

## step 2, build the image

```shell
# done in step 1
# git clone https://github.com/lm-sys/FastChat
cd FastChat
git clone https://github.com/forcemeter/fastchat-docker 4wei
docker build . -f 4wei/dockerfile -t fastchat:latest

# in mainland China, use the mirror-based dockerfile instead
#docker build . -f 4wei/dockerfile-china -t fastchat:latest
```
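Before pinning GPUs to the services, it is worth confirming that Docker itself can see the cards. This assumes the NVIDIA Container Toolkit is already installed on the host; the command below is a generic smoke test, not part of the original setup:

```shell
# Run nvidia-smi in a throwaway CUDA container; if this fails, fix the
# host driver / NVIDIA Container Toolkit before bringing the stack up.
docker run --rm --gpus all nvidia/cuda:11.7.1-runtime-ubuntu20.04 nvidia-smi
```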
## step 3, check the GPU configuration

First, check that the GPUs are up to the job. Running both models at the same time needs at least `60G` of GPU memory; I used 4 × 24G `P40` cards, so the cards have to be split between the Docker services.

```text
(FastChat) [root@lm-sys-01 FastChat]# nvidia-smi
Fri Aug 11 23:31:24 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.105.01   Driver Version: 515.105.01   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla P40           Off  | 00000000:00:0C.0 Off |                    0 |
| N/A   22C    P8     9W / 250W |      0MiB / 23040MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P40           Off  | 00000000:00:0D.0 Off |                    0 |
| N/A   20C    P8     9W / 250W |      0MiB / 23040MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P40           Off  | 00000000:00:0E.0 Off |                    0 |
| N/A   22C    P8     9W / 250W |      0MiB / 23040MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla P40           Off  | 00000000:00:0F.0 Off |                    0 |
| N/A   23C    P8    12W / 250W |      0MiB / 23040MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
```

## step 4, GPU allocation and cluster setup

In `docker-compose.yml`, each worker is pinned to specific GPUs with a memory cap: Qwen gets one 24G card, Baichuan gets two cards with 48G in total.

```yaml
# Baichuan worker
fastchat-worker-baichuan:
  ...
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['2', '3']
            capabilities: [gpu]
  entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", ..., "--device=cuda", "--num-gpus=2", "--max-gpu-memory=24GB"]
```

`fastchat` supports a distributed `worker` architecture. My setup is a single machine with multiple cards; the only thing that changes for a multi-machine cluster is the controller address each `worker` points at:

```text
--controller-address=http://fastchat-controller:21001
```

If you have already downloaded all the models, mount their path in `docker-compose.yml`; otherwise the workers will download them automatically. The download commands (currently reachable from mainland China):

```shell
git clone git@hf.co:Qwen/Qwen-7B-Chat
git clone git@hf.co:baichuan-inc/Baichuan-13B-Chat
```

## step 5, start the services

```shell
# start in the foreground
docker compose up

# start in the background
docker compose up -d

# list the containers
docker compose ps

# open a shell inside a container
docker compose exec fastchat-controller bash
```

Then open `http://127.0.0.1:7860` in your browser.
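Once the stack is up, the OpenAI-compatible server on port 8000 can also be exercised directly. A minimal sketch with `curl`; the `model` value must match one of the `--model-names` a worker registered:

```shell
# List the models currently registered with the controller
curl http://127.0.0.1:8000/v1/models

# Send a chat completion request to the Qwen worker
curl http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen-7B-Chat", "messages": [{"role": "user", "content": "Hello!"}]}'
```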