├── .github
│   ├── 8raspi.jpg
│   ├── 8raspi2.jpg
│   ├── cover.png
│   └── workflows
│       └── main.yml
├── .gitignore
├── .vscode
│   └── launch.json
├── LICENSE
├── Makefile
├── README.md
├── converter
│   ├── .gitignore
│   ├── convert-hf.py
│   ├── convert-llama.py
│   ├── convert-tokenizer-hf.py
│   ├── convert-tokenizer-llama2.py
│   ├── convert-tokenizer-llama3.py
│   ├── requirements.txt
│   ├── tokenizer-writer.py
│   ├── writer-test.py
│   └── writer.py
├── docs
│   ├── HUGGINGFACE.md
│   └── LLAMA.md
├── examples
│   ├── chat-api-client.js
│   ├── macbeth.sh
│   └── n-workers.sh
├── launch.py
├── report
│   └── report.pdf
└── src
    ├── api-types.hpp
    ├── app.cpp
    ├── app.hpp
    ├── dllama-api.cpp
    ├── dllama.cpp
    ├── json.hpp
    ├── llm.cpp
    ├── llm.hpp
    ├── mmap.hpp
    ├── nn
    │   ├── llamafile
    │   │   ├── sgemm.cpp
    │   │   └── sgemm.hpp
    │   ├── nn-config-builder.hpp
    │   ├── nn-core.cpp
    │   ├── nn-core.hpp
    │   ├── nn-cpu-ops-test.cpp
    │   ├── nn-cpu-ops.cpp
    │   ├── nn-cpu-ops.hpp
    │   ├── nn-cpu-test.cpp
    │   ├── nn-cpu.cpp
    │   ├── nn-cpu.hpp
    │   ├── nn-executor.cpp
    │   ├── nn-executor.hpp
    │   ├── nn-network.cpp
    │   ├── nn-network.hpp
    │   ├── nn-quants.cpp
    │   ├── nn-quants.hpp
    │   ├── nn-vulkan-test.cpp
    │   ├── nn-vulkan.cpp
    │   ├── nn-vulkan.hpp
    │   ├── pthread.h
    │   └── vulkan
    │       ├── cast-forward-f32-f32.comp
    │       ├── cast-forward-f32-q80.comp
    │       ├── embedding-forward-f32-f32.comp
    │       ├── inv-rms-forward-f32-f32.comp
    │       ├── matmul-forward-f32-f32-f32.comp
    │       ├── matmul-forward-q80-q40-f32.comp
    │       ├── merge-add-forward-f32-f32.comp
    │       ├── merge-add-forward-q80-f32.comp
    │       ├── mul-forward-f32-f32.comp
    │       ├── multi-head-att-forward-f32-f32.comp
    │       ├── rms-norm-forward-f32-f32-f32.comp
    │       ├── rope-forward-f32-f32.comp
    │       ├── shift-forward-f32-f32.comp
    │       └── silu-forward-f32-f32.comp
    ├── tokenizer-test.cpp
    ├── tokenizer.cpp
    └── tokenizer.hpp
/.github/8raspi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b4rtaz/distributed-llama/a16d2f03e66437088dce2ba4b82304a8101c074f/.github/8raspi.jpg
--------------------------------------------------------------------------------
/.github/8raspi2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b4rtaz/distributed-llama/a16d2f03e66437088dce2ba4b82304a8101c074f/.github/8raspi2.jpg
--------------------------------------------------------------------------------
/.github/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b4rtaz/distributed-llama/a16d2f03e66437088dce2ba4b82304a8101c074f/.github/cover.png
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: main
2 | on:
3 | pull_request:
4 | branches:
5 | - main
6 | - feat/nn
7 | push:
8 | branches:
9 | - main
10 | - feat/nn
11 | jobs:
12 | build-linux:
13 | name: Linux
14 | runs-on: ${{matrix.os}}
15 | strategy:
16 | matrix:
17 | os:
18 | - ubuntu-latest
19 | platforms:
20 | - linux/arm64
21 | - linux/amd64
22 | steps:
23 | - name: Checkout Repo
24 | uses: actions/checkout@v3
25 | - name: Dependencies
26 | id: dependencies
27 | run: sudo apt-get update && sudo apt-get install build-essential
28 | - name: Build
29 | id: build
30 | run: |
31 | make dllama
32 | make nn-cpu-test
33 | make nn-cpu-ops-test
34 | make tokenizer-test
35 | - name: nn-cpu-test
36 | run: ./nn-cpu-test
37 | - name: nn-cpu-ops-test
38 | run: ./nn-cpu-ops-test
39 | - name: tokenizer-test
40 | run: ./tokenizer-test
41 |
42 | build-windows:
43 | name: Windows
44 | runs-on: windows-latest
45 | steps:
46 | - name: Checkout Repo
47 | uses: actions/checkout@v3
48 | - name: Dependencies
49 | id: dependencies
50 | run: choco install make
51 | - name: Build
52 | id: build
53 | run: |
54 | make dllama
55 | make nn-cpu-test
56 | make nn-cpu-ops-test
57 | make tokenizer-test
58 | - name: nn-cpu-test
59 | run: ./nn-cpu-test
60 | - name: nn-cpu-ops-test
61 | run: ./nn-cpu-ops-test
62 | - name: tokenizer-test
63 | run: ./tokenizer-test
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/settings.json
2 |
3 | *.o
4 | *.0
5 | *.dSYM
6 | *.data
7 | *.temp
8 | *.tmp
9 | __pycache__
10 |
11 | *-test
12 | /models
13 | main
14 | run*.sh
15 | server
16 | /dllama
17 | /dllama-*
18 | *.exe
19 | *.spv
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "name": "main",
6 | "type": "cppdbg",
7 | "request": "launch",
8 | "program": "${workspaceFolder}/main",
9 | "args": [],
10 | "stopAtEntry": false,
11 | "cwd": "${workspaceFolder}",
12 | "environment": [],
13 | "externalConsole": false,
14 | "MIMode": "lldb"
15 | }
16 | ]
17 | }
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2024 Bartłomiej Tadych (b4rtaz)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | CXX = g++
2 | CXXFLAGS = -std=c++11 -Werror -Wformat -Werror=format-security
3 |
4 | ifndef TERMUX_VERSION
5 | CXXFLAGS += -march=native -mtune=native
6 | endif
7 |
8 | ifdef DEBUG
9 | CXXFLAGS += -g -fsanitize=address
10 | else
11 | CXXFLAGS += -O3
12 | endif
13 |
14 | ifdef WVLA
15 | CXXFLAGS += -Wvla-extension
16 | endif
17 |
18 | ifdef DLLAMA_VULKAN
19 | CGLSLC = glslc
20 |
21 | ifeq ($(OS),Windows_NT)
22 | LIBS += -L$(VK_SDK_PATH)\lib -lvulkan-1
23 | CXXFLAGS += -DDLLAMA_VULKAN -I$(VK_SDK_PATH)\include
24 | else
25 | LIBS += -lvulkan
26 | CXXFLAGS += -DDLLAMA_VULKAN
27 | endif
28 |
29 | DEPS += nn-vulkan.o
30 | endif
31 |
32 | ifeq ($(OS),Windows_NT)
33 | LIBS += -lws2_32
34 | DELETE_CMD = del /f
35 | else
36 | LIBS += -lpthread
37 | DELETE_CMD = rm -fv
38 | endif
39 |
40 | .PHONY: clean dllama
41 |
42 | clean:
43 | $(DELETE_CMD) *.o dllama dllama-* socket-benchmark mmap-buffer-* *-test *.exe
44 |
45 | # nn
46 | nn-quants.o: src/nn/nn-quants.cpp
47 | $(CXX) $(CXXFLAGS) -c $^ -o $@
48 | nn-core.o: src/nn/nn-core.cpp
49 | $(CXX) $(CXXFLAGS) -c $^ -o $@
50 | nn-executor.o: src/nn/nn-executor.cpp
51 | $(CXX) $(CXXFLAGS) -c $^ -o $@
52 | nn-network.o: src/nn/nn-network.cpp
53 | $(CXX) $(CXXFLAGS) -c $^ -o $@
54 | llamafile-sgemm.o: src/nn/llamafile/sgemm.cpp
55 | $(CXX) $(CXXFLAGS) -c $^ -o $@
56 | nn-cpu-ops.o: src/nn/nn-cpu-ops.cpp
57 | $(CXX) $(CXXFLAGS) -c $^ -o $@
58 | nn-cpu.o: src/nn/nn-cpu.cpp
59 | $(CXX) $(CXXFLAGS) -c $^ -o $@
60 | nn-cpu-test: src/nn/nn-cpu-test.cpp nn-quants.o nn-core.o nn-executor.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o
61 | $(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
62 | nn-cpu-ops-test: src/nn/nn-cpu-ops-test.cpp nn-quants.o nn-core.o nn-executor.o llamafile-sgemm.o nn-cpu.o
63 | $(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
64 | nn-vulkan.o: src/nn/nn-vulkan.cpp
65 | $(CXX) $(CXXFLAGS) -c $^ -o $@
66 |
67 | ifdef DLLAMA_VULKAN
68 | VULKAN_SHADER_SRCS := $(wildcard src/nn/vulkan/*.comp)
69 | VULKAN_SHADER_BINS := $(VULKAN_SHADER_SRCS:.comp=.spv)
70 | DEPS += $(VULKAN_SHADER_BINS)
71 |
72 | %.spv: %.comp
73 | $(CGLSLC) -c $< -o $@
74 | nn-vulkan-test: src/nn/nn-vulkan-test.cpp nn-quants.o nn-core.o nn-executor.o nn-vulkan.o ${DEPS}
75 | $(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
76 | endif
77 |
78 | # llm
79 | tokenizer.o: src/tokenizer.cpp
80 | $(CXX) $(CXXFLAGS) -c $^ -o $@
81 | llm.o: src/llm.cpp
82 | $(CXX) $(CXXFLAGS) -c $^ -o $@
83 | app.o: src/app.cpp
84 | $(CXX) $(CXXFLAGS) -c $^ -o $@
85 | tokenizer-test: src/tokenizer-test.cpp nn-quants.o nn-core.o llamafile-sgemm.o nn-cpu-ops.o tokenizer.o
86 | $(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
87 | dllama: src/dllama.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o ${DEPS}
88 | $(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
89 | dllama-api: src/dllama-api.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o ${DEPS}
90 | $(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
91 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Distributed Llama](.github/cover.png)
2 |
3 | # Distributed Llama
4 |
5 | [Build](https://github.com/b4rtaz/distributed-llama/actions) [License](/LICENSE) [Sponsor](https://github.com/sponsors/b4rtaz) [Discord](https://discord.com/widget?id=1245814812353495070&theme=dark)
6 |
7 | Connect home devices into a powerful cluster to accelerate LLM inference. More devices mean faster performance, leveraging tensor parallelism and high-speed synchronization over Ethernet.
8 |
9 | Supports Linux, macOS, and Windows. Optimized for ARM and x86_64 AVX2 CPUs.
10 |
11 | **News**
12 | - 23 Mar 2025 - [🌋 Experimental Vulkan support](https://github.com/b4rtaz/distributed-llama/releases/tag/v0.13.0)
13 | - 12 Feb 2025 - 🚧 Merged the [fundamental codebase refactor](https://github.com/b4rtaz/distributed-llama/releases/tag/v0.12.0)
14 | - 9 Jan 2025 - [🍎 Llama 3.3 70B on 4 x Mac Mini M4 Pro 24GB RAM](https://github.com/b4rtaz/distributed-llama/discussions/147)
15 | - 28 Jul 2024 - [🌳 How to Run Llama 3.1 405B on Home Devices? Build AI Cluster!](https://medium.com/@b4rtaz/how-to-run-llama-3-405b-on-home-devices-build-ai-cluster-ad0d5ad3473b)
16 |
17 |
18 | ### 🔥 Setup Root Node by Single Command
19 |
20 | Python 3 and a C++ compiler are required. The command will download the model and the tokenizer.
21 |
22 | | Model | Size | Command |
23 | | --------------------------------- | -------- | ---------------------------------------------------- |
24 | | Llama 3.1 8B Instruct Q40 | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40` |
25 | | Llama 3.1 405B Instruct Q40 | 238 GB | `python launch.py llama3_1_405b_instruct_q40` |
26 | | Llama 3.2 1B Instruct Q40 | 1.7 GB | `python launch.py llama3_2_1b_instruct_q40` |
27 | | Llama 3.2 3B Instruct Q40 | 3.4 GB | `python launch.py llama3_2_3b_instruct_q40` |
28 | | Llama 3.3 70B Instruct Q40 | 40 GB | `python launch.py llama3_3_70b_instruct_q40` |
29 | | DeepSeek R1 Distill Llama 8B Q40 | 6.32 GB | `python launch.py deepseek_r1_distill_llama_8b_q40` |
30 |
31 | ### 🛠️ Convert Model Manually
32 |
33 | Supported architectures: Llama.
34 |
35 | * [How to Convert Llama 3.1](./docs/LLAMA.md)
36 | * [How to Convert Hugging Face Model](./docs/HUGGINGFACE.md)
37 |
38 | ### 🚧 Known Limitations
39 |
40 | * You can run Distributed Llama only on 1, 2, 4... 2^n nodes.
41 | * The maximum number of nodes is equal to the number of KV heads in the model [#70](https://github.com/b4rtaz/distributed-llama/issues/70).
42 | * Only the following quantizations are supported [#183](https://github.com/b4rtaz/distributed-llama/issues/183):
43 | * `q40` model with `q80` `buffer-float-type`
44 | * `f32` model with `f32` `buffer-float-type`
45 |
46 | ### 👷 Architecture
47 |
48 | The project is split up into two parts:
49 | * **Root node** - it's responsible for loading the model and weights and forwarding them to the workers. It also synchronizes the state of the neural network. The root node is itself a worker and processes its own slice of the neural network.
50 | * **Worker node** - it processes its own slice of the neural network. It doesn't require any configuration related to the model.
51 |
52 | You always need the root node and you can add 2^n - 1 worker nodes to speed up the inference. The RAM usage of the neural network is split up across all nodes. The root node requires a bit more RAM than worker nodes.
53 |
54 | ### 🎹 Commands
55 |
56 | * `dllama inference` - run the inference with a simple benchmark,
57 | * `dllama chat` - run the CLI chat,
58 | * `dllama worker` - run the worker node,
59 | * `dllama-api` - run the API server.
60 |
61 |
62 |
63 | 🎹 Supported Arguments
64 |
65 | Inference, Chat, API
66 |
67 | | Argument | Description | Example |
68 | | ---------------------------- | ---------------------------------------------------------------- | -------------------------------------- |
69 | | `--model <path>` | Path to the model file. | `dllama_model_meta-llama-3-8b_q40.m` |
70 | | `--tokenizer <path>` | Path to the tokenizer file. | `dllama_tokenizer_llama3.t` |
71 | | `--buffer-float-type <type>` | Float precision of synchronization. | `q80` |
72 | | `--workers <workers>` | Addresses of workers (ip:port), separated by space. | `10.0.0.1:9999 10.0.0.2:9999` |
73 | | `--max-seq-len <max_seq_len>` | The maximum sequence length; it helps to reduce RAM usage. | `4096` |
74 |
75 | Inference, Chat, Worker, API
76 |
77 | | Argument | Description | Example |
78 | | ---------------------------- | --------------------------------------------------------------------- | ----------------------------------- |
79 | | `--nthreads <n>` | Number of threads. Don't set a higher value than the number of CPU cores. | `4` |
80 |
81 | Worker, API
82 |
83 | | Argument | Description | Example |
84 | | ---------------------------- | --------------------------------- | ----------------- |
85 | | `--port <port>` | Binding port. | `9999` |
86 |
87 | Inference
88 |
89 | | Argument | Description | Example |
90 | | ---------------------------- | ------------------------------ | ------------------ |
91 | | `--prompt <prompt>` | Initial prompt. | `"Hello World"` |
92 | | `--steps <steps>` | Number of tokens to generate. | `256` |
93 |
94 |
95 |
96 | ## 📊 Measurements
97 |
98 | Please check the [discussions](https://github.com/b4rtaz/distributed-llama/discussions) section, where many measurements from different configurations have been published.
99 |
100 | ## 🚀 Setup
101 |
102 | Select and expand one of the sections below:
103 |
104 |
105 |
106 | 💻 MacOS, Linux, or Windows
107 |
108 | You need x86_64 AVX2 CPUs or ARM CPUs. Different devices may have different CPUs.
109 |
110 | #### MacOS or Linux
111 |
112 | The instructions below are for Debian-based distributions, but you can easily adapt them to your distribution or macOS.
113 |
114 | 1. Install Git and GCC:
115 | ```sh
116 | sudo apt install git build-essential
117 | ```
118 | 2. Clone this repository and compile Distributed Llama on all computers:
119 | ```sh
120 | git clone https://github.com/b4rtaz/distributed-llama.git
121 | cd distributed-llama
122 | make dllama
123 | make dllama-api
124 | ```
125 |
126 | Continue to step 3.
127 |
128 | #### Windows
129 |
130 | 1. Install Git and Mingw (via [Chocolatey](https://chocolatey.org/install)):
131 | ```powershell
132 | choco install mingw
133 | ```
134 | 2. Clone this repository and compile Distributed Llama on all computers:
135 | ```sh
136 | git clone https://github.com/b4rtaz/distributed-llama.git
137 | cd distributed-llama
138 | make dllama
139 | make dllama-api
140 | ```
141 |
142 | Continue to step 3.
143 |
144 | #### Run Cluster
145 |
146 | 3. Transfer weights and the tokenizer file to the root computer.
147 | 4. Run worker nodes on worker computers:
148 | ```sh
149 | ./dllama worker --port 9999 --nthreads 4
150 | ```
151 | 5. Run root node on the root computer:
152 | ```sh
153 | ./dllama inference --model dllama_model_meta-llama-3-8b_q40.m --tokenizer dllama_tokenizer_llama3.t --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9999
154 | ```
155 |
156 | To add more worker nodes, just add more addresses to the `--workers` argument.
157 |
158 | ```
159 | ./dllama inference ... --workers 192.168.0.1:9999 192.168.0.2:9999 192.168.0.3:9999
160 | ```
161 |
162 |
163 |
164 |
165 |
166 | 📟 Raspberry Pi
167 |
168 |
169 |
170 | 1. Install `Raspberry Pi OS Lite (64 bit)` on your Raspberry Pi devices. This OS doesn't have a desktop environment.
171 | 2. Connect all devices to your switch or router.
172 | 3. Connect to all devices via SSH.
173 | ```
174 | ssh user@raspberrypi1.local
175 | ssh user@raspberrypi2.local
176 | ```
177 | 4. Install Git:
178 | ```sh
179 | sudo apt install git
180 | ```
181 | 5. Clone this repository and compile Distributed Llama on all devices:
182 | ```sh
183 | git clone https://github.com/b4rtaz/distributed-llama.git
184 | cd distributed-llama
185 | make dllama
186 | make dllama-api
187 | ```
188 | 6. Transfer weights and the tokenizer file to the root device.
189 | 7. Optional: assign static IP addresses.
190 | ```sh
191 | sudo ip addr add 10.0.0.1/24 dev eth0 # 1st device
192 | sudo ip addr add 10.0.0.2/24 dev eth0 # 2nd device
193 | ```
194 | 8. Run worker nodes on worker devices:
195 | ```sh
196 | sudo nice -n -20 ./dllama worker --port 9999 --nthreads 4
197 | ```
198 | 9. Run root node on the root device:
199 | ```sh
200 | sudo nice -n -20 ./dllama inference --model dllama_model_meta-llama-3-8b_q40.m --tokenizer dllama_tokenizer_llama3.t --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9999
201 | ```
202 |
203 | To add more worker nodes, just add more addresses to the `--workers` argument.
204 |
205 | ```
206 | ./dllama inference ... --workers 10.0.0.2:9999 10.0.0.3:9999 10.0.0.4:9999
207 | ```
208 |
209 |
210 |
211 | ## ✋ Contribution
212 |
213 | Feel free to contribute to this project. For small changes, simply create a new pull request. For larger changes, please create an issue to discuss your plans. Please follow these guidelines when contributing:
214 |
215 | * Make only minimal changes and avoid modifying files that are not necessary.
216 | * Ensure the code is compatible across all supported systems and CPUs.
217 | * This repository is maintained in English.
218 |
219 | ## 💡 License
220 |
221 | This project is released under the MIT license.
222 |
223 | ## 📖 Citation
224 |
225 | ```
226 | @misc{dllama,
227 | author = {Bartłomiej Tadych},
228 | title = {Distributed Llama},
229 | year = {2024},
230 | publisher = {GitHub},
231 | journal = {GitHub repository},
232 | howpublished = {\url{https://github.com/b4rtaz/distributed-llama}},
233 | commit = {7eb77ca93ec0d502e28d36b6fb20039b449cbea4}
234 | }
235 | ```
236 |
--------------------------------------------------------------------------------
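
The README above lists `dllama-api` among the commands but shows only shell usage; the request shape for that HTTP server appears in `examples/chat-api-client.js` further down in this repository. For a Python-side illustration, here is a minimal sketch of the same `/v1/chat/completions` call. The payload and response fields are taken from that example client; the host, port, and `max_tokens` value are illustrative assumptions, and a `dllama-api` instance must already be running.

```python
# Minimal sketch of a chat request against a running dllama-api server.
# Endpoint and payload fields mirror examples/chat-api-client.js; host, port,
# and max_tokens are assumptions for illustration.
import json
import urllib.request

HOST, PORT = '127.0.0.1', 9999

def chat(messages, max_tokens=128):
    payload = {
        'messages': messages,
        'temperature': 0.7,
        'stop': ['<|eot_id|>'],
        'max_tokens': max_tokens,
    }
    request = urllib.request.Request(
        f'http://{HOST}:{PORT}/v1/chat/completions',
        data=json.dumps(payload).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
        method='POST')
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read())

if __name__ == '__main__':
    result = chat([
        {'role': 'system', 'content': 'You are an excellent math teacher.'},
        {'role': 'user', 'content': 'What is 1 + 2?'},
    ])
    print(result['usage'])
    print(result['choices'][0]['message']['content'])
```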
/converter/.gitignore:
--------------------------------------------------------------------------------
1 | *.t
2 | *.m
3 | *.bin
4 | */
5 |
--------------------------------------------------------------------------------
/converter/convert-hf.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import json
3 | import sys
4 | import os
5 | from writer import parseFloatType, writeTensor, writeHeader, FloatType
6 | from safetensors import safe_open
7 |
8 | class ArchType:
9 | LLAMA = 0xABCD00
10 |
11 | def permute(tensor, nHeads: int, nKvHeads: int):
12 | if nHeads != nKvHeads:
13 | nHeads = nKvHeads
14 | return (tensor.reshape(nHeads, 2, tensor.shape[0] // nHeads // 2, *tensor.shape[1:]).swapaxes(1, 2).reshape(tensor.shape))
15 |
16 | class Processor:
17 | def __init__(self, config):
18 | self.config = config
19 | self.currentModelIndex = None
20 | self.currentModel = None
21 | self.currentModelKeys = None
22 | self.layerMap = {}
23 | self.plan = []
24 |
25 | def __unloadModel(self):
26 | if self.currentModel:
27 | del self.currentModel
28 | self.currentModel = None
29 | gc.collect()
30 |
31 | def __loadModel(self, index: int):
32 | if (self.currentModelIndex == index):
33 | return
34 | self.__unloadModel()
35 | filePath = self.config['files'][index]
36 | fileName = os.path.basename(filePath)
37 | print(f'💿 Loading file {fileName}...')
38 | self.currentModel = safe_open(filePath, framework='pt', device='cpu')
39 | self.currentModelKeys = list(self.currentModel.keys())
40 | for key in self.currentModelKeys:
41 | self.layerMap[key] = index
42 | print(f'Found {len(self.currentModelKeys)} layers')
43 | self.currentModelIndex = index
44 |
45 | def __permuteQ(self, tensor):
46 | return permute(tensor, self.config['n_heads'], self.config['n_heads'])
47 |
48 | def __permuteK(self, tensor):
49 | return permute(tensor, self.config['n_heads'], self.config['n_kv_heads'])
50 |
51 | def __preparePlan(self):
52 | wt = self.config['weights_float_type']
53 | p = self.plan
54 | p.append([FloatType.F32,
55 | 'model.embed_tokens.weight'])
56 | for l in range(0, self.config['n_layers']):
57 | p.append([wt, self.__permuteQ,
58 | f'model.layers.{l}.self_attn.q_proj.weight'])
59 | p.append([wt, self.__permuteK,
60 | f'model.layers.{l}.self_attn.k_proj.weight'])
61 | p.append([wt,
62 | f'model.layers.{l}.self_attn.v_proj.weight'])
63 | p.append([wt,
64 | f'model.layers.{l}.self_attn.o_proj.weight'])
65 |
66 | if (self.config['n_experts'] > 0):
67 | for e in range(self.config['n_experts']):
68 | p.append([wt,
69 | f'model.layers.{l}.block_sparse_moe.experts.{e}.w3.weight']) # up
70 | p.append([wt,
71 | f'model.layers.{l}.block_sparse_moe.experts.{e}.w1.weight']) # gate
72 | p.append([wt,
73 | f'model.layers.{l}.block_sparse_moe.experts.{e}.w2.weight']) # down
74 | else:
75 | p.append([wt,
76 | f'model.layers.{l}.mlp.gate_proj.weight']) # gate
77 | p.append([wt,
78 | f'model.layers.{l}.mlp.down_proj.weight']) # down
79 | p.append([wt,
80 | f'model.layers.{l}.mlp.up_proj.weight']) # up
81 |
82 | p.append([FloatType.F32,
83 | f'model.layers.{l}.input_layernorm.weight'])
84 | p.append([FloatType.F32,
85 | f'model.layers.{l}.post_attention_layernorm.weight'])
86 | p.append([FloatType.F32,
87 | 'model.norm.weight'])
88 | p.append([wt,
89 | 'lm_head.weight', 'model.embed_tokens.weight'])
90 |
91 | def write(self, outputFile: str):
92 | self.__preparePlan()
93 | for planItem in self.plan:
94 | lookup = planItem[1:]
95 | transform = None
96 | if (callable(lookup[0])):
97 | transform = lookup[0]
98 | lookup = lookup[1:]
99 |
100 | if (self.currentModelIndex == None):
101 | modelIndex = 0
102 | else:
103 | modelIndex = None
104 | for layerName in lookup:
105 | if (layerName in self.layerMap):
106 | modelIndex = self.layerMap[layerName]
107 | break
108 | if (modelIndex is None):
109 | modelIndex = self.currentModelIndex + 1
110 | self.__loadModel(modelIndex)
111 |
112 | tensor = None
113 | for layerName in lookup:
114 | if (layerName in self.currentModelKeys):
115 | tensor = self.currentModel.get_tensor(layerName)
116 | break
117 | if tensor is None:
118 | raise Exception(f'Layer {lookup[0]} not found')
119 | print(f'🔶 Writing tensor {layerName} {tensor.shape}...')
120 |
121 | floatType = planItem[0]
122 | if (transform):
123 | tensor = transform(tensor)
124 | writeTensor(outputFile, tensor, floatType)
125 |
126 | def parseArchType(type: str):
127 | archType = {
128 | 'llama': ArchType.LLAMA,
129 | 'mistral': ArchType.LLAMA,
130 | }.get(type)
131 | if (archType is None):
132 | raise Exception(f'Unsupported arch type: {type}')
133 | return archType
134 |
135 | def parseHiddenAct(act: str):
136 | hiddenAct = {
137 | 'gelu': 0,
138 | 'silu': 1
139 | }.get(act)
140 | if (hiddenAct is None):
141 | raise Exception(f'Unsupported hidden act: {act}')
142 | return hiddenAct
143 |
144 | def parseRopeType(rt: str):
145 | ropeType = {
146 | 'llama3': 2, # LLAMA3_1
147 | }.get(rt)
148 | if (ropeType is None):
149 | raise Exception(f'Unsupported rope type: {rt}')
150 | return ropeType
151 |
152 | def loadConfig(folderPath: str, weightsFloatType: int):
153 | allFiles = os.listdir(folderPath)
154 | allFiles.sort()
155 | with open(os.path.join(folderPath, 'config.json')) as fc:
156 | config = json.load(fc)
157 | files = []
158 | for fileName in allFiles:
159 | if fileName.endswith('.safetensors') and not fileName.startswith('.'):
160 | files.append(os.path.join(folderPath, fileName))
161 | if (len(files) == 0):
162 | raise Exception('No model files found')
163 |
164 | result = {
165 | 'version': 0,
166 | 'arch_type': parseArchType(config['model_type']),
167 | 'hidden_act': parseHiddenAct(config['hidden_act']),
168 | 'dim': config['hidden_size'],
169 | 'hidden_dim': config['intermediate_size'],
170 | 'n_layers': config['num_hidden_layers'],
171 | 'n_heads': config['num_attention_heads'],
172 | 'n_kv_heads': config['num_key_value_heads'],
173 | 'weights_float_type': weightsFloatType,
174 | 'max_seq_len': config['max_position_embeddings'],
175 | 'vocab_size': config['vocab_size'],
176 | 'files': files,
177 | }
178 |
179 | nExperts = config.get('num_local_experts')
180 | nActiveExperts = config.get('num_active_local_experts') or config.get('num_experts_per_tok')
181 | result['n_experts'] = int(nExperts) if nExperts is not None else 0
182 | result['n_active_experts'] = int(nActiveExperts) if nActiveExperts is not None else 0
183 |
184 | ropeTheta = config.get('rope_theta')
185 | if (ropeTheta is not None):
186 | result['rope_theta'] = int(ropeTheta)
187 |
188 | ropeScaling = config.get('rope_scaling')
189 | if (ropeScaling is not None):
190 | result['rope_scaling_factor'] = int(ropeScaling['factor'])
191 | result['rope_scaling_low_freq_factor'] = int(ropeScaling['low_freq_factor'])
192 | result['rope_scaling_high_freq_factory'] = int(ropeScaling['high_freq_factor'])
193 | result['rope_scaling_orig_max_seq_len'] = int(ropeScaling['original_max_position_embeddings'])
194 | result['rope_type'] = parseRopeType(ropeScaling['rope_type'])
195 | return result
196 |
197 | def printUsage():
198 | print('Usage: python convert-hf.py <sourceFolderPath> <weightsFloatType> <name>')
199 | print()
200 | print('Options:')
201 | print('  <sourceFolderPath> The path to the folder containing the model files')
202 | print('  <weightsFloatType> The float type of the weights (e.g. "q40")')
203 | print('  <name> The name of the model (e.g. "llama3")')
204 |
205 | if __name__ == '__main__':
206 | if (len(sys.argv) < 4):
207 | printUsage()
208 | exit(1)
209 |
210 | sourceFolderPath = sys.argv[1]
211 | weightsFloatType = parseFloatType(sys.argv[2])
212 | name = sys.argv[3]
213 | outputFileName = f'dllama_model_{name}_{sys.argv[2]}.m'
214 |
215 | print(f'Output file: {outputFileName}')
216 |
217 | config = loadConfig(sourceFolderPath, weightsFloatType)
218 |
219 | with open(outputFileName, 'wb') as outputFile:
220 | writeHeader(outputFile, config)
221 | processor = Processor(config)
222 | processor.write(outputFile)
223 |
224 | print(f'✅ {outputFileName} created successfully')
--------------------------------------------------------------------------------
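
The `permute` helper in `convert-hf.py` above is the least obvious step of the conversion: it reorders the rows of each attention head in the Q and K projection weights, presumably to undo the "rotate-half" weight layout used by Hugging Face checkpoints and restore an interleaved RoPE layout (that interpretation is my reading, not stated in the source). A minimal NumPy sketch with toy sizes makes the reordering visible:

```python
# Self-contained check of what permute() in convert-hf.py does to the row
# order of a Q/K projection; the head count and sizes here are toy values.
import numpy as np

def permute(tensor, nHeads, nKvHeads):
    # same logic as the helper in convert-hf.py
    if nHeads != nKvHeads:
        nHeads = nKvHeads
    return tensor.reshape(nHeads, 2, tensor.shape[0] // nHeads // 2, *tensor.shape[1:]) \
                 .swapaxes(1, 2).reshape(tensor.shape)

rows = np.arange(16).reshape(16, 1)   # row i contains the value i
out = permute(rows, nHeads=2, nKvHeads=2)

# Within each 8-row head, the first and second half of the rows are interleaved:
# head 0: 0,4,1,5,2,6,3,7   head 1: 8,12,9,13,10,14,11,15
assert out[:, 0].tolist() == [0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15]
```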
/converter/convert-llama.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import torch
5 | import math
6 | import numpy as np
7 | from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
8 | from pathlib import Path
9 |
10 | LAYER_CHUNK_SIZE = 48
11 |
12 | def convert(modelPath, outputPath, targetFloatType):
13 | paramsPath = os.path.join(modelPath, 'params.json')
14 | with open(paramsPath) as f:
15 | params = json.load(f)
16 | if (params['vocab_size'] < 1):
17 | raise Exception('vocab_size is invalid, please update params.json file')
18 | if (params.get('max_seq_len') is None):
19 | raise Exception('max_seq_len is required, please update params.json file')
20 | params['n_kv_heads'] = params.get('n_kv_heads') or params['n_heads']
21 | params['head_size'] = params['dim'] / params['n_heads']
22 | params['arch_type'] = 0xABCD00
23 | params['n_experts'] = 0
24 | params['n_active_experts'] = 0
25 | params['weights_float_type'] = targetFloatType
26 | if ('rope_theta' in params):
27 | params['rope_theta'] = int(params['rope_theta'])
28 |
29 | modelPaths = sorted(list(Path(modelPath).glob('consolidated.*.pth')))
30 | nSlices = len(modelPaths)
31 |
32 | layers = []
33 | layers.append('tok_embeddings.weight')
34 | for layerIndex in range(0, params['n_layers']):
35 | layers.append(f'layers.{layerIndex}.attention.wq.weight')
36 | layers.append(f'layers.{layerIndex}.attention.wk.weight')
37 | layers.append(f'layers.{layerIndex}.attention.wv.weight')
38 | layers.append(f'layers.{layerIndex}.attention.wo.weight')
39 | layers.append(f'layers.{layerIndex}.feed_forward.w1.weight')
40 | layers.append(f'layers.{layerIndex}.feed_forward.w2.weight')
41 | layers.append(f'layers.{layerIndex}.feed_forward.w3.weight')
42 | layers.append(f'layers.{layerIndex}.attention_norm.weight')
43 | layers.append(f'layers.{layerIndex}.ffn_norm.weight')
44 | layers.append('norm.weight')
45 | layers.append('output.weight')
46 |
47 | isHeaderWrote = False
48 | outFile = open(outputPath, 'wb')
49 |
50 | nChunks = math.ceil(len(layers) / LAYER_CHUNK_SIZE)
51 | for chunkIndex in range(0, nChunks):
52 | chunkLayerNames = layers[LAYER_CHUNK_SIZE * chunkIndex:LAYER_CHUNK_SIZE * (chunkIndex + 1)]
53 | models = {}
54 | for layerName in chunkLayerNames:
55 | models[layerName] = []
56 |
57 | print(f'💿 Chunking model {chunkIndex + 1}/{nChunks}...')
58 |
59 | for modelPath in modelPaths:
60 | model = torch.load(modelPath, map_location='cpu')
61 | for modelKey in model:
62 | if (modelKey in chunkLayerNames):
63 | models[modelKey].append(model[modelKey])
64 | if not isHeaderWrote:
65 | params['hidden_dim'] = model['layers.0.feed_forward.w1.weight'].shape[0] * nSlices
66 | writeHeader(outFile, params)
67 | isHeaderWrote = True
68 | del model
69 |
70 | for layerName in chunkLayerNames:
71 | if layerName == 'rope.freqs':
72 | continue
73 |
74 | isAxis1 = (
75 | layerName == 'tok_embeddings.weight' or
76 | layerName.endswith('.attention.wo.weight') or
77 | layerName.endswith('.feed_forward.w2.weight')
78 | )
79 | isAlwaysF32 = (
80 | layerName == 'tok_embeddings.weight' or
81 | layerName.endswith('.attention_norm.weight') or
82 | layerName.endswith('.ffn_norm.weight') or
83 | layerName == 'norm.weight'
84 | )
85 | floatType = FloatType.F32 if isAlwaysF32 else targetFloatType
86 |
87 | tensors = models[layerName]
88 | if len(tensors) == 1 or len(tensors[0].shape) == 1:
89 | tensor = tensors[0]
90 | else:
91 | tensor = torch.cat(tensors, dim=(1 if isAxis1 else 0))
92 |
93 | print(f'🔶 Exporting {layerName} {tensor.shape}...')
94 | writeTensor(outFile, tensor, floatType)
95 |
96 | del models
97 |
98 | outFile.close()
99 |
100 | def usage():
101 | print('Usage: python convert-llama.py <modelPath> <targetFloatType>')
102 | exit(1)
103 |
104 | if __name__ == '__main__':
105 | if (len(sys.argv) < 3):
106 | usage()
107 |
108 | modelPath = sys.argv[1]
109 | targetFloatType = parseFloatType(sys.argv[2])
110 | targetFloatTypeStr = strFloatType(targetFloatType)
111 |
112 | modelName = os.path.basename(modelPath)
113 | outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'
114 |
115 | print(f'Model name: {modelName}')
116 | print(f'Target float type: {targetFloatTypeStr}')
117 | print(f'Target file: {outputFileName}')
118 |
119 | convert(modelPath, outputFileName, targetFloatType)
120 |
121 | print('Done!')
122 |
--------------------------------------------------------------------------------
/converter/convert-tokenizer-hf.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | import os
4 | from sentencepiece import SentencePieceProcessor
5 | from transformers import PreTrainedTokenizerFast
6 | writer = __import__('tokenizer-writer')
7 |
8 | def openJson(path):
9 | with open(path, 'r', encoding='utf-8') as file:
10 | return json.load(file)
11 |
12 | def unicodeToBytes():
13 | # https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
14 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
15 | cs = bs[:]
16 | n = 0
17 | for b in range(2 ** 8):
18 | if b not in bs:
19 | bs.append(b)
20 | cs.append(2 ** 8 + n)
21 | n += 1
22 | cs = [chr(n) for n in cs]
23 | return dict(zip(cs, bs))
24 |
25 | class TokensResolver:
26 | def __init__(self, dirPath, tokenizerConfig):
27 | self.dirPath = dirPath
28 | self.tokenizerConfig = tokenizerConfig
29 | self.bosId = None
30 | self.eosIds = None
31 | self.tokens = []
32 | self.scores = []
33 |
34 | def resolvePreTrainedTokenizerFast(self):
35 | utb = unicodeToBytes()
36 | tokenizer = PreTrainedTokenizerFast(tokenizer_file = os.path.join(self.dirPath, 'tokenizer.json'))
37 | vocabLen = len(tokenizer.get_vocab())
38 | for i in range(vocabLen):
39 | tokenChars = list(tokenizer.convert_ids_to_tokens([i])[0])
40 | tokenBytes = []
41 | for chr in tokenChars:
42 | if (chr in utb):
43 | tokenBytes.append(utb[chr])
44 | else:
45 | tokenBytes += list(chr.encode('utf-8'))
46 | self.tokens.append(bytes(tokenBytes))
47 | self.scores.append(-float(i))
48 |
49 | self.bosId = tokenizer.bos_token_id
50 | if (tokenizer.eos_token_id):
51 | self.eosIds = [tokenizer.eos_token_id]
52 | if (self.bosId is None or self.eosIds is None):
53 | config = openJson(os.path.join(self.dirPath, 'config.json'))
54 | if (self.bosId is None):
55 | self.bosId = config['bos_token_id']
56 | if (self.eosIds is None):
57 | self.eosIds = config['eos_token_id']
58 | if isinstance(self.eosIds, list):
59 | self.eosIds = self.eosIds[:2] # TODO: add support more than 2 eos ids
60 | else:
61 | self.eosIds = [self.eosIds]
62 |
63 | def resolveLlamaTokenizer(self):
64 | modelPath = os.path.join(self.dirPath, 'tokenizer.model')
65 | processor = SentencePieceProcessor(model_file=modelPath)
66 |
67 | assert processor.vocab_size() == processor.get_piece_size()
68 | self.bosId = processor.bos_id()
69 | self.eosIds = [processor.eos_id()]
70 | vocabSize = processor.vocab_size()
71 | for i in range(vocabSize):
72 | t = processor.id_to_piece(i)
73 | s = processor.get_score(i)
74 | t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
75 | # Check for byte characters
76 | if len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
77 | # For example, "<0x0A>"" is a newline character
78 | b = bytearray.fromhex(t[3:-1])
79 | else:
80 | b = t.encode('utf-8')
81 | self.tokens.append(b)
82 | self.scores.append(s)
83 |
84 | def resolve(self):
85 | cls = self.tokenizerConfig['tokenizer_class']
86 | if (cls == 'PreTrainedTokenizerFast' or cls == 'LlamaTokenizerFast'):
87 | return self.resolvePreTrainedTokenizerFast()
88 | if (cls == 'LlamaTokenizer'):
89 | return self.resolveLlamaTokenizer()
90 | raise Exception(f'Tokenizer {cls} is not supported')
91 |
92 |
93 | def printUsage():
94 | print('Usage: python convert-tokenizer-hf.py <folderPath> <name>')
95 | print()
96 | print('Options:')
97 | print('  <folderPath> The path to the folder with tokenizer_config.json')
98 | print('  <name> The name of the tokenizer (e.g. "llama3")')
99 |
100 | if __name__ == '__main__':
101 | if (len(sys.argv) < 3):
102 | printUsage()
103 | exit(1)
104 |
105 | dirPath = sys.argv[1]
106 | name = sys.argv[2]
107 | tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
108 |
109 | resolver = TokensResolver(dirPath, tokenizerConfig)
110 | resolver.resolve()
111 |
112 | if (resolver.bosId is None or resolver.eosIds is None):
113 | raise Exception('Cannot resolve bosId or eosIds')
114 | print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
115 | for eosId in resolver.eosIds:
116 | print(f'eosId: {eosId} ({resolver.tokens[eosId]})')
117 |
118 | chatTemplate = None
119 | chatExtraStop = None
120 | if ('chat_template' in tokenizerConfig):
121 | chatTemplate = tokenizerConfig['chat_template'].encode('utf-8')
122 | input = input('⏩ Enter value for chat extra stop (enter to skip): ')
123 | if (input != ''):
124 | chatExtraStop = input.encode('utf-8')
125 |
126 | outputFileName = f'dllama_tokenizer_{name}.t'
127 | with open(outputFileName, 'wb') as outputFile:
128 | writer.writeTokenizer(outputFile, {
129 | 'bos_id': resolver.bosId,
130 | 'eos_id': resolver.eosIds[0],
131 | 'chat_eos_id': resolver.eosIds[1 if len(resolver.eosIds) > 1 else 0],
132 | }, resolver.tokens, resolver.scores, chatTemplate, chatExtraStop)
133 | print(f'✅ Created {outputFileName}')
134 |
--------------------------------------------------------------------------------
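
The `unicodeToBytes()` table in `convert-tokenizer-hf.py` above is the standard GPT-2 byte-level BPE mapping: printable bytes map to themselves, and the remaining byte values get printable stand-ins at code points 256 and above, which is why a leading space shows up as `Ġ` in such vocabularies. A short self-contained sketch (it mirrors the helper above; the sample token is an assumption) shows how the resolver turns vocabulary strings back into raw bytes:

```python
# Sketch of the byte-level BPE mapping that resolvePreTrainedTokenizerFast()
# inverts; unicodeToBytes() mirrors the helper in convert-tokenizer-hf.py.
def unicodeToBytes():
    bs = list(range(ord('!'), ord('~') + 1)) + \
         list(range(ord('¡'), ord('¬') + 1)) + \
         list(range(ord('®'), ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return dict(zip([chr(c) for c in cs], bs))

utb = unicodeToBytes()
assert utb['A'] == ord('A')                        # printable ASCII maps to itself
assert utb['Ġ'] == 0x20                            # 'Ġ' stands for a leading space
token = 'Ġhello'                                   # a typical byte-level vocabulary entry
assert bytes(utb[ch] for ch in token) == b' hello'
```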
/converter/convert-tokenizer-llama2.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from sentencepiece import SentencePieceProcessor
4 | writer = __import__('tokenizer-writer')
5 |
6 | chatTemplate = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
7 |
8 | def printUsage():
9 | print('Usage: python convert-tokenizer-llama2.py <llama2FolderPath>')
10 | print()
11 | print('Options:')
12 | print('  <llama2FolderPath> The path to the folder containing the Llama 2 tokenizer.model')
13 |
14 | if __name__ == '__main__':
15 | if (len(sys.argv) < 2):
16 | printUsage()
17 | exit(1)
18 |
19 | dirPath = sys.argv[1]
20 | modelPath = os.path.join(dirPath, 'tokenizer.model')
21 | processor = SentencePieceProcessor(model_file=modelPath)
22 |
23 | vocabSize = processor.vocab_size()
24 | tokens = []
25 | scores = []
26 | for i in range(vocabSize):
27 | t = processor.id_to_piece(i)
28 | s = processor.get_score(i)
29 | t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
30 | b = t.encode('utf-8')
31 | tokens.append(b)
32 | scores.append(s)
33 |
34 | outputFileName = 'dllama_tokenizer_llama2.t'
35 | with open(outputFileName, 'wb') as outputFile:
36 | writer.writeTokenizer(outputFile, {
37 | 'bos_id': processor.bos_id(),
38 | 'eos_id': processor.eos_id(),
39 | 'chat_eos_id': processor.eos_id(),
40 | }, tokens, scores, chatTemplate.encode('utf-8'), None)
41 |
42 | print(f'✅ Created {outputFileName}')
43 |
--------------------------------------------------------------------------------
/converter/convert-tokenizer-llama3.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import base64
3 | writer = __import__('tokenizer-writer')
4 |
5 | # Format of input file:
6 | # ```
7 | # IQ== 0
8 | # Ig== 1
9 | # Iw== 2
10 | # ...
11 | # ```
12 |
13 | nSpecialTokens = 256
14 | specialTokens = [
15 | '<|begin_of_text|>',
16 | '<|end_of_text|>',
17 | '<|reserved_special_token_0|>',
18 | '<|reserved_special_token_1|>',
19 | '<|reserved_special_token_2|>',
20 | '<|reserved_special_token_3|>',
21 | '<|start_header_id|>',
22 | '<|end_header_id|>',
23 | '<|reserved_special_token_4|>',
24 | '<|eot_id|>',
25 | ] + [
26 | f'<|reserved_special_token_{i}|>'
27 | for i in range(5, nSpecialTokens - 5)
28 | ]
29 | bosId = 128000
30 | eosId = 128001
31 | chatEosId = 128009
32 | chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
33 |
34 | def printUsage():
35 | print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
36 | print()
37 | print('Options:')
38 | print('  <tokenizerPath> The path to the Llama 3 tokenizer model (tokenizer.model)')
39 |
40 | if __name__ == '__main__':
41 | if (len(sys.argv) < 2):
42 | printUsage()
43 | exit(1)
44 |
45 | modelPath = sys.argv[1]
46 | outputFileName = 'dllama_tokenizer_llama3.t'
47 |
48 | with open(modelPath, 'r') as inputFile:
49 | with open(outputFileName, 'wb') as outputFile:
50 | inputLines = inputFile.readlines()
51 | nLines = len(inputLines)
52 |
53 | tokens = []
54 | scores = []
55 | for line in inputLines:
56 | s = line.split(' ')
57 | bytes = base64.b64decode(s[0])
58 | score = -float(s[1])
59 | tokens.append(bytes)
60 | scores.append(score)
61 |
62 | specialTokenIndex = nLines
63 | for token in specialTokens:
64 | bytes = token.encode('utf-8')
65 | score = -float(specialTokenIndex)
66 | tokens.append(bytes)
67 | scores.append(score)
68 | specialTokenIndex += 1
69 |
70 | writer.writeTokenizer(outputFile, {
71 | 'bos_id': bosId,
72 | 'eos_id': eosId,
73 | 'chat_eos_id': chatEosId,
74 | }, tokens, scores, chatTemplate.encode('utf-8'), None)
75 |
76 | print(f'✅ Created {outputFileName}')
77 |
--------------------------------------------------------------------------------
/converter/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requires Python >= 3.9
2 | numpy==1.23.5
3 | torch==2.0.1
4 | safetensors==0.4.2
5 | sentencepiece==0.1.99
--------------------------------------------------------------------------------
/converter/tokenizer-writer.py:
--------------------------------------------------------------------------------
1 | import struct
2 |
3 | def writeTokenizer(file, params, tokens, scores, chatTemplate, chatExtraStop):
4 | assert(params['eos_id'] is not None)
5 | assert(params['bos_id'] is not None)
6 |
7 | headerKeys = {
8 | 'version': 0,
9 | 'vocab_size': 1,
10 | 'max_token_length': 2,
11 | 'bos_id': 3,
12 | 'eos_id': 4,
13 | 'pad_id': 5,
14 | 'chat_eos_id': 6,
15 | 'chat_template': 7,
16 | 'chat_stop': 8
17 | }
18 | header = struct.pack('i', 0x567124)
19 |
20 | nTokens = len(tokens)
21 | maxTokenLength = max(len(t) for t in tokens)
22 |
23 | params['version'] = 1
24 | params['vocab_size'] = nTokens
25 | params['max_token_length'] = maxTokenLength
26 | if (chatTemplate):
27 | params['chat_template'] = len(chatTemplate)
28 | if (chatExtraStop):
29 | params['chat_stop'] = len(chatExtraStop)
30 |
31 | data = b''
32 | for key in params:
33 | value = params[key]
34 | if value is None:
35 | continue
36 | if key in headerKeys:
37 | data += struct.pack('ii', headerKeys[key], params[key])
38 | else:
39 | print(f'Unknown header key: {key}')
40 |
41 | print('⭐ Params:')
42 | print(params)
43 | if (chatTemplate):
44 | print('⭐ Chat template:')
45 | print(chatTemplate)
46 |
47 | header += struct.pack('i', len(header) * 2 + len(data))
48 | file.write(header)
49 | file.write(data)
50 | if chatTemplate:
51 | file.write(chatTemplate)
52 | if chatExtraStop:
53 | file.write(chatExtraStop)
54 |
55 | for i in range(0, nTokens):
56 | size = len(tokens[i])
57 | assert(size > 0)
58 | file.write(struct.pack('fI', scores[i], size))
59 | file.write(tokens[i])
60 |
--------------------------------------------------------------------------------
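
`writeTokenizer()` above fully determines the binary layout of the `.t` tokenizer file: a 4-byte magic `0x567124`, a 4-byte total header size, int32 key/value pairs, the optional chat-template and extra-stop bytes, and then one `(float32 score, uint32 length, token bytes)` record per token. As a cross-check, here is a minimal reader sketch derived only from that writer; it is not part of the project.

```python
# Minimal reader for the .t format produced by writeTokenizer() above.
# Derived from the writer only; the key names follow its headerKeys table.
import struct

HEADER_KEYS = ['version', 'vocab_size', 'max_token_length', 'bos_id',
               'eos_id', 'pad_id', 'chat_eos_id', 'chat_template', 'chat_stop']

def readTokenizer(path):
    with open(path, 'rb') as f:
        magic, headerSize = struct.unpack('ii', f.read(8))
        assert magic == 0x567124
        params = {}
        for _ in range((headerSize - 8) // 8):        # 8 bytes per key/value pair
            key, value = struct.unpack('ii', f.read(8))
            params[HEADER_KEYS[key]] = value
        chatTemplate = f.read(params['chat_template']) if 'chat_template' in params else None
        chatExtraStop = f.read(params['chat_stop']) if 'chat_stop' in params else None
        tokens, scores = [], []
        for _ in range(params['vocab_size']):
            score, size = struct.unpack('fI', f.read(8))
            tokens.append(f.read(size))
            scores.append(score)
        return params, tokens, scores, chatTemplate, chatExtraStop
```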
/converter/writer-test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import torch
4 | from writer import writeQuantizedQ40Tensor
5 |
6 | TEMP_FILE_NAME = 'writer-test.temp'
7 |
8 | def readHexFromFile(path):
9 | with open(path, 'rb') as file:
10 | return file.read().hex()
11 |
12 | def testWriteQuantizedQ40Tensor():
13 | EXPECTED_OUTPUT = '7e346345a692b89665b2c5790537876e598aaa366d988876a898b8d788a98868ce660c66f6b3a88cba5ce9a871987ba9cc5bcaaa760c1eb556a4455b747b6b9504968828ef2a8d7c1db5c6be3764799e66db6d8e76463126a30e4333cad7a4f645947c6cf97f9de086d468c8d535a6ba7dc799d3d0c657bab6799468cad8bb349eb7d7635c7c798998696bb38e4085a9eb34444ba96a7f8ba7b2b42d746a96cf9660aeb4499d8708ad5c7b9a7558947645f3bbb6b0346a656887ad9a86059baac5c596ab781c703569bb8a4356a4bd58cb78736ba09759bb0e34a6274e827b957d7a67dfa86846955660d234b6d9d78a378094a8a8708a7a774ae92f8a36b8c999a9b77a7d958a69747c807963941235379886d69a7a8767b3a6a4ac71999760'
14 |
15 | torch.manual_seed(seed=1)
16 | tensor = torch.randn(32, 16)
17 |
18 | with open(TEMP_FILE_NAME, 'wb') as file:
19 | writeQuantizedQ40Tensor(file, tensor)
20 |
21 | contentHex = readHexFromFile(TEMP_FILE_NAME)
22 | assert contentHex == EXPECTED_OUTPUT, f'Received: {contentHex}'
23 | print('✅ writeQuantizedQ40Tensor')
24 |
25 | def runWriteQuantizedQ40TensorBenchmark():
26 | tensor = torch.randn(8192, 4096)
27 | t0 = time.time()
28 | with open(TEMP_FILE_NAME, 'wb') as file:
29 | writeQuantizedQ40Tensor(file, tensor)
30 | t1 = time.time()
31 | print(f'🕐 writeQuantizedQ40Tensor: {t1 - t0:.4f}s')
32 |
33 | if __name__ == '__main__':
34 | testWriteQuantizedQ40Tensor()
35 | runWriteQuantizedQ40TensorBenchmark()
36 |
--------------------------------------------------------------------------------
/converter/writer.py:
--------------------------------------------------------------------------------
1 | import struct
2 | import torch
3 | import time
4 | import numpy as np
5 |
6 | class FloatType:
7 | F32 = 0
8 | F16 = 1
9 | Q40 = 2
10 | Q80 = 3
11 |
12 | floatTypeMap = {
13 | 'f32': FloatType.F32,
14 | 'f16': FloatType.F16,
15 | 'q40': FloatType.Q40,
16 | 'q80': FloatType.Q80,
17 | }
18 | floatTypeNames = list(floatTypeMap.keys())
19 |
20 | def parseFloatType(type):
21 | floatType = floatTypeMap.get(type)
22 | if floatType is not None:
23 | return floatType
24 | raise Exception(f'{type} is not supported')
25 |
26 | def strFloatType(type):
27 | return floatTypeNames[type]
28 |
29 | def writeQuantizedQ40Tensor(file, x):
30 | x = x.to(torch.float32).numpy().astype(np.float32)
31 | blockSize = 32
32 | blockHalfSize = blockSize // 2
33 | assert(x.shape[0] % blockSize == 0)
34 | groups = x.reshape(-1, blockSize)
35 | gmax = np.max(groups, axis=1)
36 | gmin = np.min(groups, axis=1)
37 | deltas = np.divide(np.where(-gmin > gmax, gmin, gmax), -8)
38 | deltas16 = deltas.astype(np.float16)
39 | ids = np.where(deltas != 0, 1.0 / deltas, 0)
40 | groups = np.add(groups * ids[:, np.newaxis], 8.5)
41 | groups = np.clip(groups, 0, 15).astype(int)
42 |
43 | gLow = groups[:, :blockHalfSize] & 0xF
44 | gHigh = (groups[:, blockHalfSize:] & 0xF) << 4
45 | gCombined = gLow | gHigh
46 |
47 | nBytes = 0
48 | for groupIndex in range(0, len(groups)):
49 | delta16 = deltas16[groupIndex]
50 | buffer = struct.pack(f'e{blockHalfSize}B', delta16, *gCombined[groupIndex])
51 | file.write(buffer)
52 | nBytes += len(buffer)
53 | return nBytes
54 |
55 | def writeQuantizedQ80Tensor(file, x):
56 | x = x.to(torch.float32).numpy().astype(np.float32)
57 | blockSize = 32
58 | assert(x.shape[0] % blockSize == 0)
59 | groups = x.reshape(-1, blockSize)
60 | gmax = np.max(groups, axis=1)
61 | gmin = np.min(groups, axis=1)
62 | gabsMax = np.where(-gmin > gmax, -gmin, gmax)
63 | deltas = gabsMax / ((1 << 7) - 1)
64 | deltas16 = deltas.astype(np.float16)
65 | ids = np.where(deltas != 0, 1.0 / deltas, 0)
66 | groups = groups * ids[:, np.newaxis]
67 | groups8 = np.round(groups).astype(np.int8)
68 |
69 | nBytes = 0
70 | for groupIndex in range(0, len(groups)):
71 | buffer = struct.pack(f'e{blockSize}b', deltas16[groupIndex], *groups8[groupIndex])
72 | file.write(buffer)
73 | nBytes += len(buffer)
74 | return nBytes
75 |
76 | def writeF32Tensor(file, d):
77 | chunkSize = 10000
78 | nBytes = 0
79 | for i in range(0, len(d), chunkSize):
80 | chunk = d[i:i+chunkSize].to(torch.float32).numpy().astype(np.float32)
81 | b = struct.pack(f'{len(chunk)}f', *chunk)
82 | nBytes += len(b)
83 | file.write(b)
84 | return nBytes
85 |
86 | def writeF16Tensor(file, d):
87 | d = d.to(torch.float16).numpy().astype(np.float16)
88 | b = struct.pack(f'{len(d)}e', *d)
89 | file.write(b)
90 | return len(b)
91 |
92 | def writeTensor(file, tensor, floatType):
93 | d = tensor.detach().cpu().view(-1)
94 | t0 = time.time()
95 | nBytes = 0
96 | if (floatType == FloatType.F16):
97 | nBytes = writeF16Tensor(file, d)
98 | elif (floatType == FloatType.F32):
99 | nBytes = writeF32Tensor(file, d)
100 | elif (floatType == FloatType.Q40):
101 | nBytes = writeQuantizedQ40Tensor(file, d)
102 | elif (floatType == FloatType.Q80):
103 | nBytes = writeQuantizedQ80Tensor(file, d)
104 | else:
105 | raise Exception(f'Unknown float type: {floatType}')
106 | t1 = time.time()
107 | print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')
108 |
109 | def writeHeader(file, params):
110 | headerKeys = {
111 | 'version': 0,
112 | 'arch_type': 1,
113 | 'dim': 2,
114 | 'hidden_dim': 3,
115 | 'n_layers': 4,
116 | 'n_heads': 5,
117 | 'n_kv_heads': 6,
118 | 'n_experts': 7,
119 | 'n_active_experts': 8,
120 | 'vocab_size': 9,
121 | 'max_seq_len': 10,
122 | 'hidden_act': 11,
123 | 'rope_theta': 12,
124 | 'weights_float_type': 13,
125 | 'rope_scaling_factor': 14,
126 | 'rope_scaling_low_freq_factor': 15,
127 | 'rope_scaling_high_freq_factory': 16,
128 | 'rope_scaling_orig_max_seq_len': 17,
129 | 'rope_type': 18,
130 | }
131 | header = struct.pack('i', 0xA00ABCD)
132 |
133 | data = b''
134 | for key in params:
135 | if key in headerKeys:
136 | data += struct.pack('ii', headerKeys[key], params[key])
137 | else:
138 | print(f'Warning: Unknown header key: {key}')
139 |
140 | header += struct.pack('i', len(header) * 2 + len(data))
141 | file.write(header)
142 | file.write(data)
143 | for key in params:
144 | print(f'🎓 {key}: {params[key]}')
145 | print()
146 |
--------------------------------------------------------------------------------
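
`writeHeader()` and the quantizers above define the layout of the `.m` model file: a header of int32 key/value pairs behind the `0xA00ABCD` magic, followed by the tensors in the converter's plan order, with Q40 tensors stored as 32-value blocks of one float16 scale plus 16 packed nibbles. The sketch below, derived only from the writer code and not part of the project, reads the header back and dequantizes a single Q40 block.

```python
# Sketch of reading back what writer.py produces. Derived from writeHeader()
# and writeQuantizedQ40Tensor() only; key names follow the headerKeys table
# (including its 'rope_scaling_high_freq_factory' spelling).
import struct

HEADER_KEYS = ['version', 'arch_type', 'dim', 'hidden_dim', 'n_layers', 'n_heads',
               'n_kv_heads', 'n_experts', 'n_active_experts', 'vocab_size',
               'max_seq_len', 'hidden_act', 'rope_theta', 'weights_float_type',
               'rope_scaling_factor', 'rope_scaling_low_freq_factor',
               'rope_scaling_high_freq_factory', 'rope_scaling_orig_max_seq_len',
               'rope_type']

def readHeader(f):
    magic, headerSize = struct.unpack('ii', f.read(8))
    assert magic == 0xA00ABCD
    params = {}
    for _ in range((headerSize - 8) // 8):            # 8 bytes per key/value pair
        key, value = struct.unpack('ii', f.read(8))
        params[HEADER_KEYS[key]] = value
    return params

def dequantizeQ40Block(block: bytes):
    # One Q40 block is 18 bytes: a float16 delta plus 16 bytes packing 32 4-bit values;
    # low nibbles hold values 0..15 of the block, high nibbles hold values 16..31.
    delta, *packed = struct.unpack('e16B', block)
    low = [(b & 0xF) - 8 for b in packed]
    high = [(b >> 4) - 8 for b in packed]
    return [q * delta for q in low + high]
```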
/docs/HUGGINGFACE.md:
--------------------------------------------------------------------------------
1 | # How to Run Hugging Face 🤗 Model
2 |
3 | Currently, Distributed Llama supports two types of Hugging Face models: `llama` and `mistral`. You can try to convert any compatible Hugging Face model and run it with Distributed Llama.
4 |
5 | > [!IMPORTANT]
6 | > All converters are in the early stages of development. After conversion, the model may not work correctly.
7 |
8 | 1. Download a model, for example: [Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3/tree/main).
9 | 2. The downloaded model should contain `config.json`, `tokenizer.json`, `tokenizer_config.json`, `tokenizer.model`, and the `.safetensors` files.
10 | 3. Run the converter of the model:
11 | ```sh
12 | cd converter
13 | python convert-hf.py path/to/hf/model q40 mistral-7b-0.3
14 | ```
15 | 4. Run the converter of the tokenizer:
16 | ```sh
17 | python convert-tokenizer-hf.py path/to/hf/model mistral-7b-0.3
18 | ```
19 | 5. That's it! Now you can run Distributed Llama.
20 | ```
21 | ./dllama inference --model dllama_model_mistral-7b-0.3_q40.m --tokenizer dllama_tokenizer_mistral-7b-0.3.t --buffer-float-type q80 --prompt "Hello world"
22 | ```
23 |
--------------------------------------------------------------------------------
/docs/LLAMA.md:
--------------------------------------------------------------------------------
1 | # How to Run Llama
2 |
3 | ## How to Run Llama 2
4 |
5 | 1. Download [Llama 2](https://github.com/facebookresearch/llama) weights from Meta. This project supports 7B, 7B-chat, 13B, 13B-chat, 70B and 70B-chat models.
6 | 2. Open the `llama-2-7b/params.json` file:
7 | * replace `"vocab_size": -1` with `"vocab_size": 32000`,
8 | * add a new property: `"max_seq_len": 2048`.
9 | 3. Install dependencies of the converter:
10 | ```sh
11 | cd converter && pip install -r requirements.txt
12 | ```
13 | 4. Convert weights to Distributed Llama format. This will take a bit of time. The script requires Python 3.
14 | ```sh
15 | python convert-llama.py /path/to/meta/llama-2-7b q40
16 | ```
17 | 5. Download the tokenizer for Llama 2:
18 | ```
19 | wget https://huggingface.co/b4rtaz/Llama-2-Tokenizer-Distributed-Llama/resolve/main/dllama_tokenizer_llama2.t
20 | ```
21 | 6. Build the project:
22 | ```bash
23 | make dllama
24 | make dllama-api
25 | ```
26 | 7. Run:
27 | ```bash
28 | ./dllama inference --model dllama_model_llama-2-7b_q40.m --tokenizer dllama_tokenizer_llama2.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4
29 | ```
30 |
31 | In the table below, you can find the expected size of the converted weights with different floating-point types.
32 |
33 | | Model | Original size | Float32 | Float16 | Q40 |
34 | |-------------|---------------|----------|----------|----------|
35 | | Llama 2 7B | 13.48 GB | 25.10GB | | 3.95 GB |
36 | | Llama 2 13B | 26.03 GB | | | 7.35 GB |
37 | | Llama 2 70B | 137.97 GB | | | 36.98 GB |
38 |
39 | ## How to Run Llama 3
40 |
41 | 1. Get access to the model on the [Llama 3 website](https://llama.meta.com/llama-downloads).
42 | 2. Clone the `https://github.com/meta-llama/llama3` repository.
43 | 3. Run the `download.sh` script to download the model.
44 | 4. For the Llama 3 8B model you should have the following files:
45 | - `Meta-Llama-3-8B/consolidated.00.pth`
46 | - `Meta-Llama-3-8B/params.json`
47 | - `Meta-Llama-3-8B/tokenizer.model`
48 | 5. Open `params.json` and add a new property: `"max_seq_len": 8192`.
49 | 6. Clone the `https://github.com/b4rtaz/distributed-llama.git` repository.
50 | 7. Install dependencies of the converter:
51 | ```sh
52 | cd converter && pip install -r requirements.txt
53 | ```
54 | 8. Convert the model to the Distributed Llama format:
55 | ```bash
56 | python converter/convert-llama.py path/to/Meta-Llama-3-8B q40
57 | ```
58 | 9. Convert the tokenizer to the Distributed Llama format:
59 | ```bash
60 | python converter/convert-tokenizer-llama3.py path/to/tokenizer.model
61 | ```
62 | 10. Build the project:
63 | ```bash
64 | make dllama
65 | make dllama-api
66 | ```
67 | 11. Run the Distributed Llama:
68 | ```bash
69 | ./dllama inference --weights-float-type q40 --buffer-float-type q80 --prompt "My name is" --steps 128 --nthreads 8 --model dllama_model_meta-llama-3-8b_q40.m --tokenizer dllama_tokenizer_llama3.t
70 | ```
71 |
--------------------------------------------------------------------------------
/examples/chat-api-client.js:
--------------------------------------------------------------------------------
1 | // This is a simple client for dllama-api.
2 | //
3 | // Usage:
4 | //
5 | // 1. Start the server as described in the `src/apps/dllama-api/README.md` file.
6 | // 2. Run this script: `node examples/chat-api-client.js`
7 |
8 | const HOST = process.env.HOST ? process.env.HOST : '127.0.0.1';
9 | const PORT = process.env.PORT ? Number(process.env.PORT) : 9999;
10 |
11 | async function chat(messages, maxTokens) {
12 | const response = await fetch(`http://${HOST}:${PORT}/v1/chat/completions`, {
13 | method: 'POST',
14 | headers: {
15 | 'Content-Type': 'application/json',
16 | },
17 | body: JSON.stringify({
18 | messages,
19 | temperature: 0.7,
20 | stop: ['<|eot_id|>'],
21 | max_tokens: maxTokens
22 | }),
23 | });
24 | return await response.json();
25 | }
26 |
27 | async function ask(system, user, maxTokens) {
28 | console.log(`> system: ${system}`);
29 | console.log(`> user: ${user}`);
30 | const response = await chat([
31 | {
32 | role: 'system',
33 | content: system
34 | },
35 | {
36 | role: 'user',
37 | content: user
38 | }
39 | ], maxTokens);
40 | console.log(response.usage);
41 | console.log(response.choices[0].message.content);
42 | }
43 |
44 | async function main() {
45 | await ask('You are an excellent math teacher.', 'What is 1 + 2?', 128);
46 | await ask('You are a romantic.', 'Where is Europe?', 128);
47 | }
48 |
49 | main();
50 |
--------------------------------------------------------------------------------
/examples/macbeth.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This is a simple test of generating a sequence that fills the KV cache.
4 | #
5 | # Used model & tokenizer: https://huggingface.co/b4rtaz/llama-3-8b-distributed-llama
6 | # This test will probably work correctly only on a MacBook Pro M1, due to differences in floating-point multiplication on different CPUs.
7 |
8 | cd "$(dirname "$0")"
9 | cd ..
10 |
11 | # Source: https://www.opensourceshakespeare.org/views/plays/play_view.php?WorkID=macbeth&Scope=entire
12 | PROMPT="Duncan. What bloody man is that? He can report,
13 | As seemeth by his plight, of the revolt
14 | The newest state. 20
15 |
16 | Malcolm. This is the sergeant
17 | Who like a good and hardy soldier fought
18 | 'Gainst my captivity. Hail, brave friend!
19 | Say to the king the knowledge of the broil
20 | As thou didst leave it. 25
21 |
22 | Sergeant. Doubtful it stood;
23 | As two spent swimmers, that do cling together
24 | And choke their art. The merciless Macdonwald—
25 | Worthy to be a rebel, for to that
26 | The multiplying villanies of nature 30
27 | Do swarm upon him—from the western isles
28 | Of kerns and gallowglasses is supplied;
29 | And fortune, on his damned quarrel smiling,
30 | Show'd like a rebel's whore: but all's too weak:
31 | For brave Macbeth—well he deserves that name— 35
32 | Disdaining fortune, with his brandish'd steel,
33 | Which smoked with bloody execution,
34 | Like valour's minion carved out his passage
35 | Till he faced the slave;
36 | Which ne'er shook hands, nor bade farewell to him, 40
37 | Till he unseam'd him from the nave to the chaps,
38 | And fix'd his head upon our battlements.
39 |
40 | Duncan. O valiant cousin! worthy gentleman!
41 |
42 | Sergeant. As whence the sun 'gins his reflection
43 | Shipwrecking storms and direful thunders break, 45
44 | So from that spring whence comfort seem'd to come
45 | Discomfort swells. Mark, king of Scotland, mark:
46 | No sooner justice had with valour arm'd
47 | Compell'd these skipping kerns to trust their heels,
48 | But the Norweyan lord surveying vantage, 50
49 | With furbish'd arms and new supplies of men
50 | Began a fresh assault.
51 |
52 | Duncan. Dismay'd not this
53 | Our captains, Macbeth and Banquo?
54 |
55 | Sergeant. Yes; 55
56 | As sparrows eagles, or the hare the lion.
57 | If I say sooth, I must report they were
58 | As cannons overcharged with double cracks, so they
59 | Doubly redoubled strokes upon the foe:
60 | Except they meant to bathe in reeking wounds, 60
61 | Or memorise another Golgotha,
62 | I cannot tell.
63 | But I am faint, my gashes cry for help.
64 |
65 | Duncan. So well thy words become thee as thy wounds;
66 | They smack of honour both. Go get him surgeons. 65
67 | [Exit Sergeant, attended]
68 | Who comes here?"
69 |
70 | GENERATED="Malcolm. The worthy Thane of Ross.
71 | Duncan. What a haste looks through a duel's wounds! 70
72 | Some must be pac'd.
73 | [Exit Ross]
74 | See this encounter is like to the poring
75 | On of a beggar's story, told by one
76 | That means to pluck upon the heart the strings
77 | And draw the tears thriftily. 75
78 | [Enter Lennox]
79 | How goes the night, boy?
80 |
81 | Lennox. The night is long that none should wake.
82 |
83 | Duncan. You do not need to stare. The Moor
84 | To know the man. 'Tis the Moors devices. 80
85 | [Exit Lennox]
86 | By the happy right of mine own hands,
87 | Strike all that live in this poor thing of mine.
88 | 'Tis calld the Eyrie, and I am sick at heart.
89 | As hellish-devils do the damned souls
90 | O'their bad lives, thus ill-breveted, linger
91 | O'er lamps and forks and other instruments
92 | That prove the stages of the night. 90
93 | Good sir, take note; I bid you farewell:
94 | Come sleep, and cut short this nitty romance.
95 | [He sleeps.]
96 | If cravens, I bear them like the Minion of the moon,
97 | With tiptoe foot he sneaks and starts to be a man. 95
98 | And when he is found asleep, awake him with this armed' s address:
99 | That sleep which th'assassin hallowed,
100 | Scotland, awake; your king is murder'd, sleep no more. 100
101 | *Furbish'd. Weapons polished for battle.
102 | *Thriftily. Fastidiously, thoughtfully.
103 | *Eyrie. Fortress; the lair of birds of prey.
104 | *Minion. A braggart, a coward.
105 |
106 | 1.5
107 |
108 | Macbeth. So foul and fair a day I have not seen. 5
109 | Ross. Good morning, noble Macbeth. I come from Inverness,
110 | And find our throne void, the arm'd rest you; 10
111 | My Lord of Cassil has resigned his life.
112 | Macbeth. Whate'er you owe, in time repay, fair friends.
113 | Note you the words; I pray you do.
114 | Ross. I am your faithful servant, and will keep
115 | My sworn reward upon your life; my lord.
116 | Macbeth. You shall be well rewarded; stay the press, 20
117 | And I'll not fail. How now, good fellow?
118 | Servant. Sir, his schoolmaster. 25
119 | Macbeth. Well, good, though, old.
120 | Tell me, good fellow, how goes the night? 30
121 | Servant. There's marrygold and fire in your veins, my lord.
122 | Macbeth. He does commend you; the weight of this old night's embargoes 35
123 | Did one hour's waste of time lay upon him.
124 | I know when we are too safe, 'tis dangerous to be secure;
125 | Therefore our fearful parts do brave the danger 40
126 | Which knows it not. I see you are a gentleman.
127 | And a laudable one too; I am most off obliged.
128 | Servant. I should be sorry, my good lord, to have had the labour 45
129 | To outlive this damned hour. 50
130 | Macbeth. What's done cannot be undone. To bed, to bed, to bed.
131 | Servant. Will it please you to lie still? 55
132 | Macbeth. Lord, lord, my heart is in my mouth. All's true that ends well.
133 | Servant. I thank you, fair, and leave you to the content. 60
134 | Macbeth. You see, my lord, it smokes, and shows no cause
135 | Why the drone dies. 65
136 | Servant. Grief fills the room up of one vast stair,
137 | And downs our vaults to the inconstant man above. 70
138 | Macbeth. Go bid thy masters and thy mistress say, 75
139 | I have power in earth to do so much.
140 | There's comfort yet. They are assailable. Then say I,
141 | Thus ye may answer.
142 | Servant. He cannot be wronged; or being wronged, 80
143 | I cannot help him. 85
144 | Macbeth. You know but by this; as this, 90
145 | The Jew foole is hang'd. 95
146 | Servant. No more today, my lord. 100
147 | Macbeth. He does shame to tell him he loves him, but not remove him 105
148 | From his true place; no.
149 | Servant. That's true, and now I remember the story 110
150 | Of that sign in Leo four diurnal courses
151 | Returning in a constant motion were within 115
152 | A boare that had on Taurus' back tetracted; 120
153 | Or neuer, or but once in modulated accidence. 125
154 | Macbeth. Thou climd'st alone, ty'd to the stag's horn.
155 | Servant. I was a bull, for this the goodly year. 130
156 | Come, put me in my place.
157 | Macbeth. Now go to sleep. 135
158 | Servant. The west neuer sett before the equinox 140
159 | Till now; and sunnes look'd not theyr frequencie 145
160 | Upon our lappe till now, my lord. 150
161 | Macbeth. This game of chance you term a gong.
162 | Servant. A gong is a scotch word for an egg. 155
163 | Macbeth. Peace, be still. 160
164 | Servant. I coniecture I smell the blood of an Englishman. 165
165 | Macbeth. The faith is murthered.
166 | Servant. That murder'd in his sleep. 170
167 | Macbeth. And sleeping murdered. 175
168 | Servant. In the fair queen heere in his royal court. 180
169 | Macbeth. So great a mercy that it may last eternally.
170 | Servant. The earth hath bubbles as the water hath, 185
171 | And these are of them. Whate'er we will do 190
172 | To mend the trespasses of the comming time 195
173 | Shall be the seedes of new mischefe, and shall beget 200
174 | The formes of the extinctnese, which we are now. 205
175 | Macbeth. We have scorch'd the snake, not kill'd it. 210
176 | Servant. They hunt it in the morn. Good gally, good lord! 215
177 | It weares a gilded snout. 220
178 | Macbeth. It is the very painting of your fear. 225
179 | Servant. This is the worst. 230
180 | Macbeth. A fair quater of a mile is yet to go. 235
181 | Servant. A mile and half. 240
182 | Macbeth. I have run fifteen miles to-day.
183 | Servant. A calender's date.
184 | Macbeth. A bigger patch, a bigger patch. 245
185 | Servant. Thirteen of more. 250
186 | Macbeth. Wast thou with him? 255
187 | Servant. No, nor he to night. 260
188 | Macbeth. Thou seest the moon"
189 |
190 | echo "Generating, it can take a while..."
191 |
192 | OUTPUT=$( ( ./dllama generate --seed 12345 --temperature 0.9 --topp 0.9 --prompt "$PROMPT" --weights-float-type q40 --buffer-float-type f32 --nthreads 2 --steps 2048 --model models/llama3_8b_q40/dllama_model_llama3_8b_q40.m --tokenizer models/llama3_8b_q40/dllama_tokenizer_llama3_8b_q40.t --workers 127.0.0.1:9999 127.0.0.1:9998 127.0.0.1:9997 ) 2>&1)
193 |
194 | echo "$OUTPUT"
195 |
196 | if [[ $OUTPUT == *"$GENERATED"* ]]; then
197 | echo "✅ Output is same"
198 | else
199 | echo "❌ Output is different"
200 | fi
201 |
--------------------------------------------------------------------------------
/examples/n-workers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script starts N workers from a single command. Mainly useful for testing and debugging.
4 | # Usage:
5 | #
6 | # W=7 T=2 bash n-workers.sh start
7 | # W=7 bash n-workers.sh stop
8 | #
9 | # Env vars:
10 | # W - n workers
11 | # T - n threads per worker
12 |
13 | cd "$(dirname "$0")"
14 |
15 | if [ -z "$W" ]; then
16 | W=3
17 | fi
18 | if [ -z "$T" ]; then
19 | T=1
20 | fi
21 |
22 | if [ "$1" == "start" ]; then
23 | for (( w = 0; w < $W ; w += 1 ));
24 | do
25 | PORT=$(expr 9999 - $w)
26 | PROC_ID=$(lsof -ti:$PORT)
27 | if [ -n "$PROC_ID" ]; then
28 | kill -9 $PROC_ID
29 | echo "Killed process $PROC_ID"
30 | fi
31 |
32 | mkdir -p dllama_worker_$w # macOS does not support the -Logfile argument, so we place logs in separate directories
33 | cd dllama_worker_$w
34 | screen -d -L -S dllama_worker_$w -m ../../dllama worker --port $PORT --nthreads $T
35 | cd ..
36 | echo "Started worker $w on port $PORT"
37 | done
38 |
39 | sleep 2
40 | elif [ "$1" == "stop" ]; then
41 | for (( w = 0; w < $W ; w += 1 ));
42 | do
43 | screen -S dllama_worker_$w -X quit
44 | done
45 |
46 | echo "Stopped $W workers"
47 | else
48 | echo "Usage: $0 [start|stop]"
49 | fi
50 |
51 | echo "> screen -ls"
52 | screen -ls
53 |
--------------------------------------------------------------------------------
/launch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import multiprocessing
5 | from urllib.request import urlopen
6 |
7 | def parts(length):
8 | result = []
9 | for i in range(length):
10 | a = chr(97 + (i // 26))
11 | b = chr(97 + (i % 26))
12 | result.append(a + b)
13 | return result
14 |
15 | # [['model-url-0', 'model-url-1', ...], 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
16 | MODELS = {
17 | 'llama3_1_8b_instruct_q40': [
18 | ['https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.1_instruct_q40.m?download=true'],
19 | 'https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
20 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
21 | ],
22 | 'llama3_1_405b_instruct_q40': [
23 | list(map(lambda suffix : f'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama31_405b_q40_{suffix}?download=true', parts(56))),
24 | 'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
25 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
26 | ],
27 | 'llama3_2_1b_instruct_q40': [
28 | ['https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-1b-instruct_q40.m?download=true'],
29 | 'https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
30 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
31 | ],
32 | 'llama3_2_3b_instruct_q40': [
33 | ['https://huggingface.co/b4rtaz/Llama-3_2-3B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-3b-instruct_q40.m?download=true'],
34 | 'https://huggingface.co/b4rtaz/Llama-3_2-3B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
35 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
36 | ],
37 | 'llama3_3_70b_instruct_q40': [
38 | list(map(lambda suffix : f'https://huggingface.co/b4rtaz/Llama-3_3-70B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama-3.3-70b_q40{suffix}?download=true', parts(11))),
39 | 'https://huggingface.co/b4rtaz/Llama-3_3-70B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama-3.3-70b.t?download=true',
40 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
41 | ],
42 | 'deepseek_r1_distill_llama_8b_q40': [
43 | ['https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_model_deepseek-r1-distill-llama-8b_q40.m?download=true'],
44 | 'https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_tokenizer_deepseek-r1-distill-llama-8b.t?download=true',
45 | 'q40', 'q80', 'chat', '--max-seq-len 4096'
46 | ],
47 | }
48 |
49 | def confirm(message: str):
50 | result = input(f'❓ {message} ("Y" if yes): ').upper()
51 | return result == 'Y' or result == 'YES'
52 |
53 | def downloadFile(urls, path: str):
54 | if os.path.isfile(path):
55 | fileName = os.path.basename(path)
56 | if not confirm(f'{fileName} already exists, do you want to download again?'):
57 | return
58 |
59 | lastSizeMb = 0
60 | with open(path, 'wb') as file:
61 | for url in urls:
62 | startPosition = file.tell()
63 | success = False
64 | for attempt in range(8):
65 | print(f'📄 {url} (attempt: {attempt})')
66 | try:
67 | with urlopen(url) as response:
68 | while True:
69 | chunk = response.read(4096)
70 | if not chunk:
71 | break
72 | file.write(chunk)
73 | sizeMb = file.tell() // (1024 * 1024)
74 | if sizeMb != lastSizeMb:
75 | sys.stdout.write("\rDownloaded %i MB" % sizeMb)
76 | lastSizeMb = sizeMb
77 | sys.stdout.write('\n')
78 | success = True
79 | break
80 | except Exception as e:
81 | print(f'\n❌ Error downloading {url}: {e}')
82 | file.seek(startPosition)
83 | file.truncate()
84 | time.sleep(1 * attempt)
85 | if not success:
86 | raise Exception(f'Failed to download {url}')
87 | sys.stdout.write(' ✅\n')
88 |
89 | def download(modelName: str, model: list):
90 | dirPath = os.path.join('models', modelName)
91 | print(f'📀 Downloading {modelName} to {dirPath}...')
92 | os.makedirs(dirPath, exist_ok=True)
93 | modelUrls = model[0]
94 | tokenizerUrl = model[1]
95 | modelPath = os.path.join(dirPath, f'dllama_model_{modelName}.m')
96 | tokenizerPath = os.path.join(dirPath, f'dllama_tokenizer_{modelName}.t')
97 | downloadFile(modelUrls, modelPath)
98 | downloadFile([tokenizerUrl], tokenizerPath)
99 | print('📀 All files are downloaded')
100 | return (modelPath, tokenizerPath)
101 |
102 | def writeRunFile(modelName: str, command: str):
103 | filePath = f'run_{modelName}.sh'
104 | with open(filePath, 'w') as file:
105 | file.write('#!/bin/sh\n')
106 | file.write('\n')
107 | file.write(f'{command}\n')
108 | return filePath
109 |
110 | def printUsage():
111 | print('Usage: python download-model.py <model>')
112 | print()
113 | print('Options:')
114 | print(' <model> The name of the model to download')
115 | print(' --run Run the model after download')
116 | print()
117 | print('Available models:')
118 | for model in MODELS:
119 | print(f' {model}')
120 |
121 | if __name__ == '__main__':
122 | if (len(sys.argv) < 2):
123 | printUsage()
124 | exit(1)
125 |
126 | os.chdir(os.path.dirname(__file__))
127 |
128 | modelName = sys.argv[1].replace('-', '_')
129 | if modelName not in MODELS:
130 | print(f'Model is not supported: {modelName}')
131 | exit(1)
132 | runAfterDownload = sys.argv.count('--run') > 0
133 |
134 | model = MODELS[modelName]
135 | (modelPath, tokenizerPath) = download(modelName, model)
136 |
137 | nThreads = multiprocessing.cpu_count()
138 | if (model[4] == 'chat'):
139 | command = './dllama chat'
140 | else:
141 | command = './dllama inference --steps 64 --prompt "Hello world"'
142 | command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads {nThreads}'
143 | if (len(model) > 5):
144 | command += f' {model[5]}'
145 |
146 | print('To run Distributed Llama you need to execute:')
147 | print('--- copy start ---')
148 | print()
149 | print('\033[96m' + command + '\033[0m')
150 | print()
151 | print('--- copy end -----')
152 |
153 | runFilePath = writeRunFile(modelName, command)
154 | print(f'🌻 Created {runFilePath} script to run it easily')
155 |
156 | if (not runAfterDownload):
157 | runAfterDownload = confirm('Do you want to run Distributed Llama?')
158 | if (runAfterDownload):
159 | if (not os.path.isfile('dllama')):
160 | os.system('make dllama')
161 | os.system(command)
162 |
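163 | # Example invocation (the model name must be one of the MODELS keys above; --run is optional):
164 | #   python launch.py llama3_2_1b_instruct_q40 --run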
--------------------------------------------------------------------------------
/report/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b4rtaz/distributed-llama/a16d2f03e66437088dce2ba4b82304a8101c074f/report/report.pdf
--------------------------------------------------------------------------------
/src/api-types.hpp:
--------------------------------------------------------------------------------
1 | #ifndef API_TYPES_HPP
2 | #define API_TYPES_HPP
3 |
4 | #include <string>
5 |
6 | #include "json.hpp"
7 |
8 | using json = nlohmann::json;
9 |
10 | struct ChatMessageDelta {
11 | std::string role;
12 | std::string content;
13 |
14 | ChatMessageDelta() : role(""), content("") {}
15 | ChatMessageDelta(const std::string& role_, const std::string& content_) : role(role_), content(content_) {}
16 | };
17 |
18 | struct ChatMessage {
19 | std::string role;
20 | std::string content;
21 |
22 | ChatMessage() : role(""), content("") {}
23 | ChatMessage(const std::string& role_, const std::string& content_) : role(role_), content(content_) {}
24 | };
25 |
26 | struct ChunkChoice {
27 | int index;
28 | ChatMessageDelta delta;
29 | std::string finish_reason;
30 |
31 | ChunkChoice() : index(0) {}
32 | };
33 |
34 |
35 | struct Choice {
36 | int index;
37 | ChatMessage message;
38 | std::string finish_reason;
39 |
40 | Choice() : finish_reason("") {}
41 | Choice(ChatMessage &message_) : message(message_), finish_reason("") {}
42 | Choice(const std::string &reason_) : finish_reason(reason_) {}
43 | };
44 |
45 | struct ChatCompletionChunk {
46 | std::string id;
47 | std::string object;
48 | long long created;
49 | std::string model;
50 | std::vector<ChunkChoice> choices;
51 |
52 | ChatCompletionChunk(ChunkChoice &choice_)
53 | : id("cmpl-c0"), object("chat.completion"), model("Distributed Model") {
54 | created = std::time(nullptr); // Set created to current Unix timestamp
55 | choices.push_back(choice_);
56 | }
57 | };
58 |
59 | // Struct to represent the usage object
60 | struct ChatUsage {
61 | int prompt_tokens;
62 | int completion_tokens;
63 | int total_tokens;
64 |
65 | ChatUsage() : prompt_tokens(0), completion_tokens(0), total_tokens(0) {}
66 | ChatUsage(int pt, int ct, int tt) : prompt_tokens(pt), completion_tokens(ct), total_tokens(tt) {}
67 | };
68 |
69 | struct ChatCompletion {
70 | std::string id;
71 | std::string object;
72 | long long created; // Unix timestamp
73 | std::string model;
74 | std::vector<Choice> choices;
75 | ChatUsage usage;
76 |
77 | ChatCompletion() : id(), object(), model() {}
78 | ChatCompletion(const Choice &choice_, const ChatUsage& usage_)
79 | : id("cmpl-j0"), object("chat.completion"), model("Distributed Model"), usage(usage_) {
80 | created = std::time(nullptr); // Set created to current Unix timestamp
81 | choices.push_back(choice_);
82 | }
83 | };
84 |
85 | struct Model {
86 | std::string id;
87 | std::string object;
88 | long long created;
89 | std::string owned_by;
90 |
91 | Model() : id(), object(), created(0), owned_by() {}
92 | Model(const std::string &id_) : id(id_), object("model"), created(0), owned_by("user") {}
93 | };
94 |
95 | struct ModelList {
96 | std::string object;
97 | std::vector<Model> data;
98 | ModelList(): object("list") {}
99 | ModelList(const Model &model_) : object("list") {
100 | data.push_back(model_);
101 | }
102 | };
103 |
104 | struct InferenceParams {
105 | std::vector<ChatMessage> messages;
106 | int max_tokens;
107 | float temperature;
108 | float top_p;
109 | std::vector<std::string> stop;
110 | bool stream;
111 | unsigned long long seed;
112 | };
113 |
114 | // Define to_json for Delta struct
115 | void to_json(json& j, const ChatMessageDelta& msg) {
116 | j = json{{"role", msg.role}, {"content", msg.content}};
117 | }
118 |
119 | void to_json(json& j, const ChatMessage& msg) {
120 | j = json{{"role", msg.role}, {"content", msg.content}};
121 | }
122 |
123 | void to_json(json& j, const ChunkChoice& choice) {
124 | j = json{{"index", choice.index}, {"delta", choice.delta}, {"finish_reason", choice.finish_reason}};
125 | }
126 |
127 | void to_json(json& j, const Choice& choice) {
128 | j = json{{"index", choice.index}, {"message", choice.message}, {"finish_reason", choice.finish_reason}};
129 | }
130 |
131 | void to_json(json& j, const ChatCompletionChunk& completion) {
132 | j = json{{"id", completion.id},
133 | {"object", completion.object},
134 | {"created", completion.created},
135 | {"model", completion.model},
136 | {"choices", completion.choices}};
137 | }
138 |
139 | void to_json(json& j, const ChatUsage& usage) {
140 | j = json{{"completion_tokens", usage.completion_tokens},
141 | {"prompt_tokens", usage.prompt_tokens},
142 | {"total_tokens", usage.total_tokens}};
143 | }
144 |
145 | void to_json(json& j, const ChatCompletion& completion) {
146 | j = json{{"id", completion.id},
147 | {"object", completion.object},
148 | {"created", completion.created},
149 | {"model", completion.model},
150 | {"usage", completion.usage},
151 | {"choices", completion.choices}};
152 | }
153 |
154 | void to_json(json& j, const Model& model) {
155 | j = json{{"id", model.id},
156 | {"object", model.object},
157 | {"created", model.created},
158 | {"owned_by", model.owned_by}};
159 | }
160 |
161 | void to_json(json& j, const ModelList& models) {
162 | j = json{{"object", models.object},
163 | {"data", models.data}};
164 | }
165 |
166 | std::vector<ChatMessage> parseChatMessages(json &json){
167 | std::vector messages;
168 | messages.reserve(json.size());
169 |
170 | for (const auto& item : json) {
171 | messages.emplace_back(
172 | item["role"].template get<std::string>(),
173 | item["content"].template get<std::string>()
174 | );
175 | }
176 | return messages;
177 | }
178 |
179 | #endif
180 |
--------------------------------------------------------------------------------
/src/app.hpp:
--------------------------------------------------------------------------------
1 | #ifndef APP_HPP
2 | #define APP_HPP
3 |
4 | #include
5 | #include "nn/nn-core.hpp"
6 | #include "nn/nn-cpu.hpp"
7 | #include "tokenizer.hpp"
8 | #include "llm.hpp"
9 |
10 | class AppCliArgs {
11 | public:
12 | char *mode;
13 | NnUint nThreads;
14 | NnUint nBatches;
15 | bool help;
16 |
17 | // inference
18 | char *modelPath;
19 | char *tokenizerPath;
20 | char *prompt;
21 | NnFloatType syncType;
22 | NnUint nWorkers;
23 | char **workerHosts;
24 | NnUint *workerPorts;
25 | float temperature;
26 | float topp;
27 | NnUint steps;
28 | bool benchmark;
29 | unsigned long long seed;
30 | ChatTemplateType chatTemplateType;
31 | NnUint maxSeqLen;
32 | bool netTurbo;
33 | int gpuIndex;
34 | int gpuSegmentFrom;
35 | int gpuSegmentTo;
36 |
37 | // worker
38 | NnUint port;
39 |
40 | static AppCliArgs parse(int argc, char **argv, bool hasMode);
41 | ~AppCliArgs();
42 | };
43 |
44 | typedef struct {
45 | NnUint position;
46 | NnUint batchSize; // 0 = stop signal
47 | } LlmControlPacket;
48 |
49 | class RootLlmInference {
50 | public:
51 | float *logitsPipe;
52 | private:
53 | float *tokenPipe;
54 | float *positionPipe;
55 | LlmHeader *header;
56 | NnNetExecution *execution;
57 | NnExecutor *executor;
58 | NnNetwork *network;
59 | LlmControlPacket controlPacket;
60 | public:
61 | RootLlmInference(LlmNet *net, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network);
62 | void setBatchSize(NnUint batchSize);
63 | void setPosition(NnUint position);
64 | void setToken(NnUint batchIndex, NnUint token);
65 | void forward();
66 | void finish();
67 | };
68 |
69 | class WorkerLlmInference {
70 | public:
71 | bool isFinished;
72 | private:
73 | float *positionPipe;
74 | NnNetExecution *execution;
75 | NnNetwork *network;
76 | LlmControlPacket controlPacket;
77 | public:
78 | WorkerLlmInference(NnNetExecution *execution, NnNetwork *network);
79 | bool tryReadControlPacket();
80 | };
81 |
82 | typedef struct {
83 | AppCliArgs *args;
84 | LlmHeader *header;
85 | RootLlmInference *inference;
86 | Tokenizer *tokenizer;
87 | Sampler *sampler;
88 | NnNetwork *network;
89 | NnExecutor *executor;
90 | } AppInferenceContext;
91 |
92 | void runInferenceApp(AppCliArgs *args, void (*handler)(AppInferenceContext *context));
93 | void runWorkerApp(AppCliArgs *args);
94 |
95 | #endif
96 |
--------------------------------------------------------------------------------
/src/dllama.cpp:
--------------------------------------------------------------------------------
1 | #include "nn/nn-core.hpp"
2 | #include "nn/nn-config-builder.hpp"
3 | #include "nn/nn-cpu.hpp"
4 | #include "nn/nn-network.hpp"
5 | #include "nn/nn-executor.hpp"
6 | #include "llm.hpp"
7 | #include "tokenizer.hpp"
8 | #include "app.hpp"
9 | #include <cstring>
10 |
11 | static void inference(AppInferenceContext *context) {
12 | if (context->args->prompt == nullptr)
13 | throw std::runtime_error("Prompt is required");
14 | if (context->args->steps == 0)
15 | throw std::runtime_error("Number of steps is required");
16 |
17 | std::vector<int> inputTokensVec(std::strlen(context->args->prompt) + 3);
18 | int *inputTokens = inputTokensVec.data();
19 |
20 | NnUint pos = 0;
21 | int token;
22 | int nInputTokens;
23 | context->tokenizer->encode(context->args->prompt, inputTokens, &nInputTokens, true, false);
24 |
25 | if (nInputTokens > context->header->seqLen)
26 | throw std::runtime_error("The number of prompt tokens is greater than the sequence length");
27 | if (nInputTokens > context->args->steps)
28 | throw std::runtime_error("The number of prompt tokens is greater than the number of steps");
29 |
30 | NnSize sentBytes = 0;
31 | NnSize recvBytes = 0;
32 | NnUint evalTotalTime = 0;
33 | NnUint predTotalTime = 0;
34 |
35 | printf("%s\n", context->args->prompt);
36 | for (;;) {
37 | long remainingTokens = nInputTokens - 1 - (long)pos;
38 | if (remainingTokens <= 0)
39 | break;
40 | NnUint batchSize = remainingTokens < context->args->nBatches
41 | ? remainingTokens
42 | : context->args->nBatches;
43 |
44 | context->inference->setBatchSize(batchSize);
45 | context->inference->setPosition(pos);
46 | for (NnUint i = 0; i < batchSize; i++)
47 | context->inference->setToken(i, inputTokens[pos + i]);
48 |
49 | context->inference->forward();
50 |
51 | pos += batchSize;
52 | token = inputTokens[pos + 1];
53 |
54 | if (context->network != nullptr)
55 | context->network->getStats(&sentBytes, &recvBytes);
56 |
57 | NnUint evalTime = context->executor->getTotalTime(STEP_EXECUTE_OP);
58 | NnUint syncTime = context->executor->getTotalTime(STEP_SYNC_NODES);
59 | printf("🔷️ Eval%5u ms Sync%5u ms | Sent%6zu kB Recv%6zu kB | (%d tokens)\n",
60 | evalTime / 1000,
61 | syncTime / 1000,
62 | sentBytes / 1024,
63 | recvBytes / 1024,
64 | batchSize);
65 | evalTotalTime += evalTime + syncTime;
66 | }
67 |
68 | fflush(stdout);
69 |
70 | context->inference->setBatchSize(1);
71 | context->tokenizer->resetDecoder();
72 |
73 | const NnUint maxPos = std::min(context->header->seqLen, context->args->steps);
74 | for (; pos < maxPos; pos++) {
75 | context->inference->setPosition(pos);
76 | context->inference->setToken(0, token);
77 | context->inference->forward();
78 |
79 | token = context->sampler->sample(context->inference->logitsPipe);
80 |
81 | char *piece = context->tokenizer->decode(token);
82 |
83 | if (context->network != nullptr)
84 | context->network->getStats(&sentBytes, &recvBytes);
85 |
86 | NnUint predTime = context->executor->getTotalTime(STEP_EXECUTE_OP);
87 | NnUint syncTime = context->executor->getTotalTime(STEP_SYNC_NODES);
88 | printf("🔶 Pred%5u ms Sync%5u ms | Sent%6zu kB Recv%6zu kB | %s\n",
89 | predTime / 1000,
90 | syncTime / 1000,
91 | sentBytes / 1024,
92 | recvBytes / 1024,
93 | piece == nullptr ? "~" : piece);
94 | fflush(stdout);
95 | predTotalTime += predTime + syncTime;
96 | }
97 |
98 | NnUint nEvalTokens = nInputTokens - 1;
99 | NnUint nPredTokens = pos - nEvalTokens;
100 | float evalTotalTimeMs = evalTotalTime / 1000.0;
101 | float predTotalTimeMs = predTotalTime / 1000.0;
102 | printf("\n");
103 | printf("Evaluation\n");
104 | printf(" nBatches: %d\n", context->args->nBatches);
105 | printf(" nTokens: %d\n", nEvalTokens);
106 | printf(" tokens/s: %3.2f (%3.2f ms/tok)\n",
107 | (nEvalTokens * 1000) / evalTotalTimeMs,
108 | evalTotalTimeMs / ((float) nEvalTokens));
109 | printf("Prediction\n");
110 | printf(" nTokens: %d\n", nPredTokens);
111 | printf(" tokens/s: %3.2f (%3.2f ms/tok)\n",
112 | (nPredTokens * 1000) / predTotalTimeMs,
113 | predTotalTimeMs / ((float) nPredTokens));
114 | }
115 |
116 | static NnUint readStdin(const char *guide, char *buffer, NnUint size) {
117 | std::fflush(stdin);
118 | std::printf("%s", guide);
119 | if (std::fgets(buffer, size, stdin) != NULL) {
120 | NnUint length = std::strlen(buffer);
121 | if (length > 0 && buffer[length - 1] == '\n') {
122 | buffer[length - 1] = '\0';
123 | length--;
124 | }
125 | return length;
126 | }
127 | return 0;
128 | }
129 |
130 | static void chat(AppInferenceContext *context) {
131 | const NnUint seqLen = context->header->seqLen;
132 | char prompt[2048];
133 |
134 | TokenizerChatStops stops(context->tokenizer);
135 | ChatTemplateGenerator templateGenerator(context->args->chatTemplateType, context->tokenizer->chatTemplate, stops.stops[0]);
136 | EosDetector eosDetector(stops.nStops, context->tokenizer->eosTokenIds.data(), stops.stops, stops.maxStopLength, stops.maxStopLength);
137 |
138 | const NnUint sysPromptLength = readStdin("💻 System prompt (optional): ", prompt, sizeof(prompt));
139 | std::vector<ChatItem> deltaItems;
140 | if (sysPromptLength > 0)
141 | deltaItems.push_back(ChatItem{"system", prompt});
142 |
143 | NnUint pos = 0;
144 | NnUint userPromptLength;
145 | int token;
146 | int nInputTokens;
147 | do {
148 | do {
149 | userPromptLength = readStdin("\n👱 User\n> ", prompt, sizeof(prompt));
150 | } while (userPromptLength == 0);
151 |
152 | deltaItems.push_back(ChatItem{"user", prompt});
153 |
154 | GeneratedChat inputPrompt = templateGenerator.generate(deltaItems.size(), deltaItems.data(), true);
155 | std::unique_ptr<int[]> inputTokensPtr(new int[inputPrompt.length + 2]);
156 | int *inputTokens = inputTokensPtr.get();
157 |
158 | bool addBos = pos == 0;
159 | context->tokenizer->encode((char*)inputPrompt.content, inputTokens, &nInputTokens, addBos, true);
160 |
161 | NnUint userPromptEndPos = (NnUint)std::min(seqLen, pos + nInputTokens - 1);
162 | for (NnUint i = 0; ;) {
163 | int remainingTokens = userPromptEndPos - pos;
164 | if (remainingTokens <= 0)
165 | break;
166 | NnUint batchSize = remainingTokens < context->args->nBatches
167 | ? remainingTokens
168 | : context->args->nBatches;
169 |
170 | context->inference->setBatchSize(batchSize);
171 | context->inference->setPosition(pos);
172 | for (NnUint j = 0; j < batchSize; j++)
173 | context->inference->setToken(j, inputTokens[i + j]);
174 |
175 | context->inference->forward();
176 |
177 | i += batchSize;
178 | pos += batchSize;
179 | token = inputTokens[i + 1];
180 | }
181 |
182 | context->inference->setBatchSize(1);
183 | context->tokenizer->resetDecoder();
184 |
185 | printf("\n🤖 Assistant\n");
186 | if (inputPrompt.publicPrompt != nullptr)
187 | printf("%s", inputPrompt.publicPrompt);
188 |
189 | while (pos < seqLen) {
190 | context->inference->setPosition(pos);
191 | context->inference->setToken(0, token);
192 | context->inference->forward();
193 |
194 | token = context->sampler->sample(context->inference->logitsPipe);
195 |
196 | char *piece = context->tokenizer->decode(token);
197 | EosDetectorType eosType = eosDetector.append(token, piece);
198 | if (eosType == NOT_EOS || eosType == EOS) {
199 | char *delta = eosDetector.getDelta();
200 | if (delta != nullptr) {
201 | printf("%s", delta);
202 | fflush(stdout);
203 | }
204 | eosDetector.reset();
205 | }
206 | pos++;
207 | if (eosType == EOS) break;
208 | }
209 |
210 | deltaItems.clear();
211 | } while (pos < seqLen);
212 |
213 | printf("(end of context)\n");
214 | }
215 |
216 | int main(int argc, char **argv) {
217 | initQuants();
218 | initSockets();
219 |
220 | int returnCode = EXIT_SUCCESS;
221 | try {
222 | AppCliArgs args = AppCliArgs::parse(argc, argv, true);
223 | if (std::strcmp(args.mode, "inference") == 0) {
224 | args.benchmark = true;
225 | runInferenceApp(&args, &inference);
226 | } else if (std::strcmp(args.mode, "chat") == 0)
227 | runInferenceApp(&args, &chat);
228 | else if (std::strcmp(args.mode, "worker") == 0)
229 | runWorkerApp(&args);
230 | else
231 | throw std::runtime_error("Unsupported mode");
232 | } catch (std::exception &e) {
233 | printf("🚨 Critical error: %s\n", e.what());
234 | returnCode = EXIT_FAILURE;
235 | }
236 |
237 | cleanupSockets();
238 | return returnCode;
239 | }
240 |
--------------------------------------------------------------------------------
/src/llm.hpp:
--------------------------------------------------------------------------------
1 | #ifndef LLM_HPP
2 | #define LLM_HPP
3 |
4 | #include "nn/nn-core.hpp"
5 | #include "nn/nn-executor.hpp"
6 | #include "nn/nn-network.hpp"
7 |
8 | enum LlmHeaderKey {
9 | VERSION = 0,
10 | ARCH_TYPE = 1,
11 | DIM = 2,
12 | HIDDEN_DIM = 3,
13 | N_LAYERS = 4,
14 | N_HEADS = 5,
15 | N_KV_HEADS = 6,
16 | N_EXPERTS = 7,
17 | N_ACTIVE_EXPERTS = 8,
18 | VOCAB_SIZE = 9,
19 | SEQ_LEN = 10,
20 | HIDDEN_ACT = 11,
21 | ROPE_THETA = 12,
22 | WEIGHT_FLOAT_TYPE = 13,
23 | ROPE_SCALING_FACTOR = 14,
24 | ROPE_SCALING_LOW_FREQ_FACTOR = 15,
25 | ROPE_SCALING_HIGH_FREQ_FACTORY = 16,
26 | ROPE_SCALING_ORIG_MAX_SEQ_LEN = 17,
27 | ROPE_TYPE = 18,
28 | };
29 |
30 | enum LlmHiddenAct {
31 | HIDDEN_ACT_GELU,
32 | HIDDEN_ACT_SILU,
33 | };
34 |
35 | enum LlmArchType {
36 | LLAMA = 0xABCD00,
37 | };
38 |
39 | typedef struct {
40 | NnSize headerSize;
41 | NnSize fileSize;
42 | int version;
43 | LlmArchType archType;
44 | NnUint dim;
45 | NnUint nLayers;
46 | NnUint nHeads;
47 | NnUint headSize;
48 | NnUint nKvHeads;
49 | NnUint nExperts;
50 | NnUint nActiveExperts;
51 | NnUint origSeqLen; // Original model context length
52 | NnUint seqLen; // Limited context length by the `--max-seq-len` argument
53 | NnUint hiddenDim;
54 | LlmHiddenAct hiddenAct;
55 | NnUint kvDim;
56 | NnUint vocabSize;
57 | float ropeTheta;
58 | NnRopeType ropeType;
59 | float ropeScalingFactor;
60 | float ropeScalingLowFreqFactor;
61 | float ropeScalingHighFreqFactory;
62 | NnUint ropeScalingOrigMaxSeqLen;
63 | float normEpsilon;
64 |
65 | NnFloatType weightType;
66 | NnFloatType syncType;
67 | } LlmHeader;
68 |
69 | typedef struct {
70 | LlmHeader *header;
71 | NnNetConfig netConfig;
72 | NnNodeConfig *nodeConfigs;
73 | NnRowMatmulSlice qSlice;
74 | NnRowMatmulSlice kSlice;
75 | NnRowMatmulSlice vSlice;
76 | NnColMatmulSlice woSlice;
77 | NnRowMatmulSlice w1Slice;
78 | NnColMatmulSlice w2Slice;
79 | NnRowMatmulSlice w3Slice;
80 | NnRowMatmulSlice wclsSlice;
81 | NnUint positionPipeIndex;
82 | NnUint tokenPipeIndex;
83 | NnUint xPipeIndex;
84 | NnUint logitsPipeIndex;
85 | NnSize2D tokenEmbeddingSize;
86 | NnSize2D rmsNormSize;
87 | } LlmNet;
88 |
89 | LlmHeader loadLlmHeader(const char* path, const unsigned int maxSeqLen, NnFloatType syncType);
90 | void printLlmHeader(LlmHeader *header);
91 | LlmNet buildLlmNet(LlmHeader *h, NnUint nNodes, NnUint nBatches);
92 | void releaseLlmNet(LlmNet *net);
93 | void loadLlmNetWeight(const char* path, LlmNet *net, NnRootWeightLoader *loader);
94 |
95 | #endif
--------------------------------------------------------------------------------
/src/mmap.hpp:
--------------------------------------------------------------------------------
1 | #ifndef MMAP_HPP
2 | #define MMAP_HPP
3 |
4 | #include <cstdio>
5 | #include <stdexcept>
6 | #ifdef _WIN32
7 | #include <windows.h>
8 | #else
9 | #include <sys/mman.h>
10 | #include <fcntl.h>
11 | #include <unistd.h>
12 | #endif
13 |
14 | struct MmapFile {
15 | void* data;
16 | size_t size;
17 | #ifdef _WIN32
18 | HANDLE hFile;
19 | HANDLE hMapping;
20 | #else
21 | int fd;
22 | #endif
23 | };
24 |
25 | long seekToEnd(FILE* file) {
26 | #ifdef _WIN32
27 | _fseeki64(file, 0, SEEK_END);
28 | return _ftelli64(file);
29 | #else
30 | fseek(file, 0, SEEK_END);
31 | return ftell(file);
32 | #endif
33 | }
34 |
35 | void openMmapFile(MmapFile *file, const char *path, size_t size) {
36 | file->size = size;
37 | #ifdef _WIN32
38 | file->hFile = CreateFileA(path, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
39 | if (file->hFile == INVALID_HANDLE_VALUE) {
40 | printf("Cannot open file %s\n", path);
41 | exit(EXIT_FAILURE);
42 | }
43 |
44 | file->hMapping = CreateFileMappingA(file->hFile, NULL, PAGE_READONLY, 0, 0, NULL);
45 | if (file->hMapping == NULL) {
46 | printf("CreateFileMappingA failed, error: %lu\n", GetLastError());
47 | CloseHandle(file->hFile);
48 | exit(EXIT_FAILURE);
49 | }
50 |
51 | file->data = (void *)MapViewOfFile(file->hMapping, FILE_MAP_READ, 0, 0, 0);
52 | if (file->data == NULL) {
53 | printf("MapViewOfFile failed!\n");
54 | CloseHandle(file->hMapping);
55 | CloseHandle(file->hFile);
56 | exit(EXIT_FAILURE);
57 | }
58 | #else
59 | file->fd = open(path, O_RDONLY);
60 | if (file->fd == -1) {
61 | throw std::runtime_error("Cannot open file");
62 | }
63 |
64 | file->data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, file->fd, 0);
65 | if (file->data == MAP_FAILED) {
66 | close(file->fd);
67 | throw std::runtime_error("Mmap failed");
68 | }
69 | #endif
70 | }
71 |
72 | void closeMmapFile(MmapFile *file) {
73 | #ifdef _WIN32
74 | UnmapViewOfFile(file->data);
75 | CloseHandle(file->hMapping);
76 | CloseHandle(file->hFile);
77 | #else
78 | munmap(file->data, file->size);
79 | close(file->fd);
80 | #endif
81 | }
82 |
83 | #endif
--------------------------------------------------------------------------------
/src/nn/llamafile/sgemm.hpp:
--------------------------------------------------------------------------------
1 | #ifndef LLAMAFILE_SGEMM_H
2 | #define LLAMAFILE_SGEMM_H
3 |
4 | #include <cstdint>
5 |
6 | bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
7 | int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype);
8 |
9 | #endif
10 |
--------------------------------------------------------------------------------
/src/nn/nn-config-builder.hpp:
--------------------------------------------------------------------------------
1 | #ifndef NN_CONFIG_BUILDER_H
2 | #define NN_CONFIG_BUILDER_H
3 |
4 | #include "nn-core.hpp"
5 | #include <list>
6 | #include <cstring>
7 |
8 | static char *cloneString(const char *str) {
9 | NnUint len = std::strlen(str);
10 | char *copy = new char[len + 1];
11 | std::memcpy(copy, str, len + 1);
12 | return copy;
13 | }
14 |
15 | class NnNetConfigBuilder {
16 | public:
17 | NnUint nNodes;
18 | NnUint nBatches;
19 | std::list<NnPipeConfig> pipes;
20 | std::list<NnPreSyncConfig> preSyncs;
21 |
22 | NnNetConfigBuilder(NnUint nNodes, NnUint nBatches) {
23 | this->nNodes = nNodes;
24 | this->nBatches = nBatches;
25 | }
26 |
27 | NnUint addPipe(const char *name, NnSize2D size) {
28 | NnUint pipeIndex = pipes.size();
29 | pipes.push_back({ cloneString(name), size });
30 | return pipeIndex;
31 | }
32 |
33 | void addPreSync(NnUint pipeIndex) {
34 | preSyncs.push_back({ pipeIndex });
35 | }
36 |
37 | NnNetConfig build() {
38 | NnNetConfig config;
39 | config.nNodes = nNodes;
40 | config.nBatches = nBatches;
41 | config.nPipes = pipes.size();
42 | config.pipes = new NnPipeConfig[config.nPipes];
43 | std::copy(pipes.begin(), pipes.end(), config.pipes);
44 | config.nPreSyncs = preSyncs.size();
45 | if (config.nPreSyncs > 0) {
46 | config.preSyncs = new NnPreSyncConfig[config.nPreSyncs];
47 | std::copy(preSyncs.begin(), preSyncs.end(), config.preSyncs);
48 | } else {
49 | config.preSyncs = nullptr;
50 | }
51 | return config;
52 | }
53 | };
54 |
55 | class NnNodeConfigBuilder {
56 | public:
57 | NnUint nodeIndex;
58 | std::list<NnBufferConfig> buffers;
59 | std::list<NnSegmentConfig> segments;
60 |
61 | NnNodeConfigBuilder(NnUint nodeIndex) {
62 | this->nodeIndex = nodeIndex;
63 | }
64 |
65 | NnUint addBuffer(const char *name, NnSize2D size) {
66 | NnUint bufferIndex = buffers.size();
67 | buffers.push_back({ cloneString(name), size });
68 | return bufferIndex;
69 | }
70 |
71 | void addSegment(NnSegmentConfig segment) {
72 | segments.push_back(segment);
73 | }
74 |
75 | NnNodeConfig build() {
76 | NnNodeConfig config;
77 | config.nodeIndex = nodeIndex;
78 | config.nBuffers = buffers.size();
79 | if (config.nBuffers > 0) {
80 | config.buffers = new NnBufferConfig[config.nBuffers];
81 | std::copy(buffers.begin(), buffers.end(), config.buffers);
82 | } else {
83 | config.buffers = nullptr;
84 | }
85 |
86 | config.nSegments = segments.size();
87 | assert(config.nSegments > 0);
88 | config.segments = new NnSegmentConfig[config.nSegments];
89 | std::copy(segments.begin(), segments.end(), config.segments);
90 | return config;
91 | }
92 | };
93 |
94 | class NnSegmentConfigBuilder {
95 | private:
96 | std::list<NnOpConfig> ops;
97 | std::list<NnSyncConfig> syncs;
98 |
99 | public:
100 | template <typename T>
101 | void addOp(NnOpCode code, const char *name, NnUint index, NnPointerConfig input, NnPointerConfig output, NnSize2D weightSize, T config) {
102 | NnUint configSize = sizeof(T);
103 | NnByte *configCopy = new NnByte[configSize];
104 | std::memcpy(configCopy, &config, configSize);
105 | ops.push_back({
106 | code,
107 | cloneString(name),
108 | index,
109 | input,
110 | output,
111 | weightSize,
112 | configCopy,
113 | configSize
114 | });
115 | };
116 |
117 | void addSync(NnUint pipeIndex, NnSyncType syncType) {
118 | syncs.push_back({ pipeIndex, syncType });
119 | }
120 |
121 | NnSegmentConfig build() {
122 | NnSegmentConfig segment;
123 | segment.nOps = ops.size();
124 | if (segment.nOps > 0) {
125 | segment.ops = new NnOpConfig[segment.nOps];
126 | std::copy(ops.begin(), ops.end(), segment.ops);
127 | }
128 | segment.nSyncs = syncs.size();
129 | if (segment.nSyncs > 0) {
130 | segment.syncs = new NnSyncConfig[segment.nSyncs];
131 | std::copy(syncs.begin(), syncs.end(), segment.syncs);
132 | }
133 | return segment;
134 | }
135 | };
136 |
137 | #endif
--------------------------------------------------------------------------------
/src/nn/nn-core.hpp:
--------------------------------------------------------------------------------
1 | #ifndef NN_CORE_H
2 | #define NN_CORE_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include "nn-quants.hpp"
9 |
10 | // primitives
11 |
12 | typedef struct {
13 | NnFloatType floatType;
14 | NnUint y;
15 | NnUint x;
16 | NnSize length;
17 | NnSize nBytes;
18 | } NnSize2D;
19 |
20 | // slices
21 |
22 | typedef struct {
23 | NnUint kvDim0;
24 | NnSize2D keySize;
25 | NnSize2D valueSize;
26 | } NnKvCacheSlice;
27 |
28 | typedef struct {
29 | NnFloatType type;
30 | NnUint nNodes;
31 | NnUint d0;
32 | NnUint n;
33 | NnSize2D size;
34 | NnSize2D sliceSize;
35 | } NnRowMatmulSlice;
36 |
37 | typedef struct {
38 | NnFloatType type;
39 | NnUint nNodes;
40 | NnUint n;
41 | NnUint n0;
42 | NnUint d;
43 | NnSize2D size;
44 | NnSize2D sliceSize;
45 | } NnColMatmulSlice;
46 |
47 | typedef struct {
48 | NnUint qDim0;
49 | NnUint qDimStart;
50 | NnUint qDimEnd;
51 | NnUint qShift;
52 | NnUint kvDim;
53 | NnUint kvDim0;
54 | NnUint kvDimStart;
55 | NnUint sliceDim;
56 | NnUint seqLen;
57 | NnUint headSize;
58 | NnUint nKvHeads;
59 | float ropeTheta;
60 | NnSize2D cacheSize;
61 | } NnRopeSlice;
62 |
63 | typedef struct {
64 | NnUint nHeads;
65 | NnUint nHeads0;
66 | NnSize2D attSize;
67 | } NnMultiHeadAttSlice;
68 |
69 | // base enums
70 |
71 | enum NnOpCode {
72 | OP_MERGE_ADD,
73 | OP_EMBEDDING,
74 | OP_INV_RMS,
75 | OP_RMS_NORM,
76 | OP_MATMUL,
77 | OP_ROPE_LLAMA,
78 | OP_MULTIHEAD_ATT,
79 | OP_GELU,
80 | OP_SILU,
81 | OP_MUL,
82 | OP_CAST,
83 | OP_SHIFT,
84 | };
85 |
86 | enum NnOpQuantType {
87 | // __