├── models └── .gitignore ├── .gitmodules ├── .gitignore ├── utils.h ├── convert-pth-to-ggml.py ├── Makefile ├── README.md ├── quantize.cpp ├── utils.cpp ├── ggml.h ├── main.cpp └── server.cpp /models/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "wsServer"] 2 | path = wsServer 3 | url = https://github.com/Theldus/wsServer.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | .cache/ 4 | .vs/ 5 | .vscode/ 6 | .DS_Store 7 | 8 | build/ 9 | build-em/ 10 | build-debug/ 11 | build-release/ 12 | build-static/ 13 | build-no-accel/ 14 | build-sanitize-addr/ 15 | build-sanitize-thread/ 16 | 17 | models/* 18 | 19 | /main 20 | /quantize 21 | /server 22 | 23 | arm_neon.h 24 | compile_commands.json 25 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | // Various helper functions and utilities 2 | 3 | #pragma once 4 | 5 | #include <string> 6 | #include <map> 7 | #include <vector> 8 | #include <random> 9 | #include <thread> 10 | 11 | // 12 | // CLI argument parsing 13 | // 14 | 15 | struct gpt_params { 16 | int32_t seed = -1; // RNG seed 17 | int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); 18 | int32_t n_predict = 128; // new tokens to predict 19 | 20 | // sampling parameters 21 | int32_t top_k = 40; // unused 22 | float top_p = 0.95f; 23 | float temp = 0.80f; 24 | 25 | int32_t n_batch = 8; // batch size for prompt processing 26 | 27 | std::string model = "models/lamma-7B/ggml-model.bin"; // model path 28 | std::string prompt; 29 | }; 30 | 31 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params); 32 | 33 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params); 34 | 35 | std::string gpt_random_prompt(std::mt19937 & rng); 36 | 37 | // 38 | // Vocab utils 39 | // 40 | 41 | struct gpt_vocab { 42 | using id = int32_t; 43 | using token = std::string; 44 | 45 | std::map<token, id> token_to_id; 46 | std::map<id, token> id_to_token; 47 | }; 48 | 49 | void replace(std::string & str, const std::string & needle, const std::string & replacement); 50 | 51 | // poor-man's JSON parsing 52 | std::map<std::string, int32_t> json_parse(const std::string & fname); 53 | 54 | // split text into tokens 55 | // 56 | // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 57 | // 58 | // Regex (Python): 59 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" 60 | // 61 | // Regex (C++): 62 | // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" 63 | // 64 | std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text); 65 | 66 | // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
67 | // ref: https://github.com/google/sentencepiece 68 | std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); 69 | 70 | // load the tokens from encoder.json 71 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); 72 | 73 | // sample next token given probabilities for each embedding 74 | // 75 | // - consider only the top K tokens 76 | // - from them, consider only the top tokens with cumulative probability > P 77 | // 78 | // TODO: not sure if this implementation is correct 79 | // TODO: temperature is not implemented 80 | // 81 | gpt_vocab::id gpt_sample_top_k_top_p( 82 | const gpt_vocab & vocab, 83 | const float * logits, 84 | int top_k, 85 | double top_p, 86 | double temp, 87 | std::mt19937 & rng); 88 | 89 | gpt_vocab::id llama_sample_top_p( 90 | const gpt_vocab & vocab, 91 | const float * logits, 92 | double top_p, 93 | double temp, 94 | std::mt19937 & rng); 95 | 96 | // 97 | // Quantization 98 | // 99 | 100 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); 101 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); 102 | -------------------------------------------------------------------------------- /convert-pth-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert a LLaMA model checkpoint to a ggml compatible file 2 | # 3 | # Load the model using Torch 4 | # Iterate over all variables and write them to a binary file. 5 | # 6 | # For each variable, write the following: 7 | # - Number of dimensions (int) 8 | # - Name length (int) 9 | # - Dimensions (int[n_dims]) 10 | # - Name (char[name_length]) 11 | # - Data (the tensor values, f16 or f32 depending on ftype) 12 | # 13 | # By default, the bigger matrices are converted to 16-bit floats. 14 | # This can be disabled by passing ftype 0 (float32) on the command line. 15 | # 16 | # At the start of the ggml file we write the model parameters 17 | # and vocabulary.
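#
# As a rough illustration of one such tensor record (a sketch only; the name and
# shape below are just an example), a 2-D f16 tensor "tok_embeddings.weight" with
# shape (n_vocab, dim) would be written as:
#
#   sname = "tok_embeddings.weight".encode('utf-8')
#   fout.write(struct.pack("iii", 2, len(sname), 1))  # n_dims, name length, ftype (1 = f16)
#   fout.write(struct.pack("i", dim))                 # dimensions are written in reverse order
#   fout.write(struct.pack("i", n_vocab))
#   fout.write(sname)
#   data.tofile(fout)                                 # raw tensor values follow immediately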
18 | # 19 | 20 | import sys 21 | import json 22 | import struct 23 | import numpy as np 24 | import torch 25 | 26 | from sentencepiece import SentencePieceProcessor 27 | 28 | if len(sys.argv) < 3: 29 | print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") 30 | print(" ftype == 0 -> float32") 31 | print(" ftype == 1 -> float16") 32 | sys.exit(1) 33 | 34 | # output in the same directory as the model 35 | dir_model = sys.argv[1] 36 | 37 | fname_hparams = sys.argv[1] + "/params.json" 38 | fname_tokenizer = sys.argv[1] + "/../tokenizer.model" 39 | 40 | def get_n_parts(dim): 41 | if dim == 4096: 42 | return 1 43 | elif dim == 5120: 44 | return 2 45 | elif dim == 6656: 46 | return 4 47 | elif dim == 8192: 48 | return 8 49 | else: 50 | print("Invalid dim: " + str(dim)) 51 | sys.exit(1) 52 | 53 | # possible data types 54 | # ftype == 0 -> float32 55 | # ftype == 1 -> float16 56 | # 57 | # map from ftype to string 58 | ftype_str = ["f32", "f16"] 59 | 60 | ftype = 1 61 | if len(sys.argv) > 2: 62 | ftype = int(sys.argv[2]) 63 | if ftype < 0 or ftype > 1: 64 | print("Invalid ftype: " + str(ftype)) 65 | sys.exit(1) 66 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 67 | 68 | with open(fname_hparams, "r") as f: 69 | hparams = json.load(f) 70 | 71 | tokenizer = SentencePieceProcessor(fname_tokenizer) 72 | 73 | hparams.update({"vocab_size": tokenizer.vocab_size()}) 74 | 75 | n_parts = get_n_parts(hparams["dim"]) 76 | 77 | print(hparams) 78 | print('n_parts = ', n_parts) 79 | 80 | for p in range(n_parts): 81 | print('Processing part ', p) 82 | 83 | #fname_model = sys.argv[1] + "/consolidated.00.pth" 84 | fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth" 85 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 86 | if (p > 0): 87 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p) 88 | 89 | model = torch.load(fname_model, map_location="cpu") 90 | 91 | fout = open(fname_out, "wb") 92 | 93 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex 94 | fout.write(struct.pack("i", hparams["vocab_size"])) 95 | fout.write(struct.pack("i", hparams["dim"])) 96 | fout.write(struct.pack("i", hparams["multiple_of"])) 97 | fout.write(struct.pack("i", hparams["n_heads"])) 98 | fout.write(struct.pack("i", hparams["n_layers"])) 99 | fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete) 100 | fout.write(struct.pack("i", ftype)) 101 | 102 | # Is this correct?? 
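# The loop below recovers the text of each of the 32000 vocabulary pieces by
# decoding the pair [29889, i]: token 29889 decodes to ".", and giving the
# decoder that bit of leading context is (presumably) what keeps SentencePiece
# from stripping a piece's leading whitespace. The first byte of the decoded
# string (the ".") is then dropped, so only the piece's own bytes are written.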
103 | for i in range(32000): 104 | # TODO: this is probably wrong - not sure how this tokenizer works 105 | text = tokenizer.decode([29889, i]).encode('utf-8') 106 | # remove the first byte (it's always '.') 107 | text = text[1:] 108 | fout.write(struct.pack("i", len(text))) 109 | fout.write(text) 110 | 111 | for k, v in model.items(): 112 | name = k 113 | shape = v.shape 114 | 115 | # skip layers.X.attention.inner_attention.rope.freqs 116 | if name[-5:] == "freqs": 117 | continue 118 | 119 | print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype) 120 | 121 | #data = tf.train.load_variable(dir_model, name).squeeze() 122 | data = v.numpy().squeeze() 123 | n_dims = len(data.shape); 124 | 125 | # for efficiency - transpose some matrices 126 | # "model/h.*/attn/c_attn/w" 127 | # "model/h.*/attn/c_proj/w" 128 | # "model/h.*/mlp/c_fc/w" 129 | # "model/h.*/mlp/c_proj/w" 130 | #if name[-14:] == "/attn/c_attn/w" or \ 131 | # name[-14:] == "/attn/c_proj/w" or \ 132 | # name[-11:] == "/mlp/c_fc/w" or \ 133 | # name[-13:] == "/mlp/c_proj/w": 134 | # print(" Transposing") 135 | # data = data.transpose() 136 | 137 | dshape = data.shape 138 | 139 | # default type is fp16 140 | ftype_cur = 1 141 | if ftype == 0 or n_dims == 1: 142 | print(" Converting to float32") 143 | data = data.astype(np.float32) 144 | ftype_cur = 0 145 | 146 | # header 147 | sname = name.encode('utf-8') 148 | fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur)) 149 | for i in range(n_dims): 150 | fout.write(struct.pack("i", dshape[n_dims - 1 - i])) 151 | fout.write(sname); 152 | 153 | # data 154 | data.tofile(fout) 155 | 156 | # I hope this deallocates the memory .. 157 | model = None 158 | 159 | fout.close() 160 | 161 | print("Done. Output file: " + fname_out + ", (part ", p, ")") 162 | print("") 163 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef UNAME_S 2 | UNAME_S := $(shell uname -s) 3 | endif 4 | 5 | ifndef UNAME_P 6 | UNAME_P := $(shell uname -p) 7 | endif 8 | 9 | ifndef UNAME_M 10 | UNAME_M := $(shell uname -m) 11 | endif 12 | 13 | CCV := $(shell $(CC) --version | head -n 1) 14 | CXXV := $(shell $(CXX) --version | head -n 1) 15 | 16 | # Mac OS + Arm can report x86_64 17 | # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 18 | ifeq ($(UNAME_S),Darwin) 19 | ifneq ($(UNAME_P),arm) 20 | SYSCTL_M := $(shell sysctl -n hw.optional.arm64) 21 | ifeq ($(SYSCTL_M),1) 22 | # UNAME_P := arm 23 | # UNAME_M := arm64 24 | warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) 25 | endif 26 | endif 27 | endif 28 | 29 | # 30 | # Compile flags 31 | # 32 | 33 | CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC 34 | CXXFLAGS = -I. 
-I./wsServer/include -I./examples -O3 -DNDEBUG -std=c++11 -fPIC 35 | LDFLAGS = 36 | 37 | # OS specific 38 | # TODO: support Windows 39 | ifeq ($(UNAME_S),Linux) 40 | CFLAGS += -pthread 41 | CXXFLAGS += -pthread 42 | endif 43 | ifeq ($(UNAME_S),Darwin) 44 | CFLAGS += -pthread 45 | CXXFLAGS += -pthread 46 | endif 47 | ifeq ($(UNAME_S),FreeBSD) 48 | CFLAGS += -pthread 49 | CXXFLAGS += -pthread 50 | endif 51 | ifeq ($(UNAME_S),Haiku) 52 | CFLAGS += -pthread 53 | CXXFLAGS += -pthread 54 | endif 55 | 56 | # Architecture specific 57 | # TODO: probably these flags need to be tweaked on some architectures 58 | # feel free to update the Makefile for your architecture and send a pull request or issue 59 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) 60 | ifeq ($(UNAME_S),Darwin) 61 | CFLAGS += -mf16c 62 | AVX1_M := $(shell sysctl machdep.cpu.features) 63 | ifneq (,$(findstring FMA,$(AVX1_M))) 64 | CFLAGS += -mfma 65 | endif 66 | ifneq (,$(findstring AVX1.0,$(AVX1_M))) 67 | CFLAGS += -mavx 68 | endif 69 | AVX2_M := $(shell sysctl machdep.cpu.leaf7_features) 70 | ifneq (,$(findstring AVX2,$(AVX2_M))) 71 | CFLAGS += -mavx2 72 | endif 73 | else ifeq ($(UNAME_S),Linux) 74 | AVX1_M := $(shell grep "avx " /proc/cpuinfo) 75 | ifneq (,$(findstring avx,$(AVX1_M))) 76 | CFLAGS += -mavx 77 | endif 78 | AVX2_M := $(shell grep "avx2 " /proc/cpuinfo) 79 | ifneq (,$(findstring avx2,$(AVX2_M))) 80 | CFLAGS += -mavx2 81 | endif 82 | FMA_M := $(shell grep "fma " /proc/cpuinfo) 83 | ifneq (,$(findstring fma,$(FMA_M))) 84 | CFLAGS += -mfma 85 | endif 86 | F16C_M := $(shell grep "f16c " /proc/cpuinfo) 87 | ifneq (,$(findstring f16c,$(F16C_M))) 88 | CFLAGS += -mf16c 89 | endif 90 | SSE3_M := $(shell grep "sse3 " /proc/cpuinfo) 91 | ifneq (,$(findstring sse3,$(SSE3_M))) 92 | CFLAGS += -msse3 93 | endif 94 | else ifeq ($(UNAME_S),Haiku) 95 | AVX1_M := $(shell sysinfo -cpu | grep "AVX ") 96 | ifneq (,$(findstring avx,$(AVX1_M))) 97 | CFLAGS += -mavx 98 | endif 99 | AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ") 100 | ifneq (,$(findstring avx2,$(AVX2_M))) 101 | CFLAGS += -mavx2 102 | endif 103 | FMA_M := $(shell sysinfo -cpu | grep "FMA ") 104 | ifneq (,$(findstring fma,$(FMA_M))) 105 | CFLAGS += -mfma 106 | endif 107 | F16C_M := $(shell sysinfo -cpu | grep "F16C ") 108 | ifneq (,$(findstring f16c,$(F16C_M))) 109 | CFLAGS += -mf16c 110 | endif 111 | else 112 | CFLAGS += -mfma -mf16c -mavx -mavx2 113 | endif 114 | endif 115 | ifeq ($(UNAME_M),amd64) 116 | CFLAGS += -mavx -mavx2 -mfma -mf16c 117 | endif 118 | ifneq ($(filter ppc64%,$(UNAME_M)),) 119 | POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) 120 | ifneq (,$(findstring POWER9,$(POWER9_M))) 121 | CFLAGS += -mpower9-vector 122 | endif 123 | # Require c++23's std::byteswap for big-endian support. 
124 | ifeq ($(UNAME_M),ppc64) 125 | CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN 126 | endif 127 | endif 128 | ifndef LLAMA_NO_ACCELERATE 129 | # Mac M1 - include Accelerate framework 130 | ifeq ($(UNAME_S),Darwin) 131 | CFLAGS += -DGGML_USE_ACCELERATE 132 | LDFLAGS += -framework Accelerate 133 | endif 134 | endif 135 | ifdef LLAMA_OPENBLAS 136 | CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas 137 | LDFLAGS += -lopenblas 138 | endif 139 | ifdef LLAMA_GPROF 140 | CFLAGS += -pg 141 | CXXFLAGS += -pg 142 | endif 143 | ifneq ($(filter aarch64%,$(UNAME_M)),) 144 | CFLAGS += -mcpu=native 145 | CXXFLAGS += -mcpu=native 146 | endif 147 | ifneq ($(filter armv6%,$(UNAME_M)),) 148 | # Raspberry Pi 1, 2, 3 149 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access 150 | endif 151 | ifneq ($(filter armv7%,$(UNAME_M)),) 152 | # Raspberry Pi 4 153 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations 154 | endif 155 | ifneq ($(filter armv8%,$(UNAME_M)),) 156 | # Raspberry Pi 4 157 | CFLAGS += -mfp16-format=ieee -mno-unaligned-access 158 | endif 159 | 160 | # 161 | # Print build information 162 | # 163 | 164 | $(info I llama.cpp build info: ) 165 | $(info I UNAME_S: $(UNAME_S)) 166 | $(info I UNAME_P: $(UNAME_P)) 167 | $(info I UNAME_M: $(UNAME_M)) 168 | $(info I CFLAGS: $(CFLAGS)) 169 | $(info I CXXFLAGS: $(CXXFLAGS)) 170 | $(info I LDFLAGS: $(LDFLAGS)) 171 | $(info I CC: $(CCV)) 172 | $(info I CXX: $(CXXV)) 173 | $(info ) 174 | 175 | default: main quantize server 176 | 177 | # 178 | # Build library 179 | # 180 | 181 | ggml.o: ggml.c ggml.h 182 | $(CC) $(CFLAGS) -c ggml.c -o ggml.o 183 | 184 | utils.o: utils.cpp utils.h 185 | $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o 186 | 187 | clean: 188 | rm -f *.o main quantize server 189 | 190 | main: main.cpp ggml.o utils.o 191 | $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) 192 | ./main -h 193 | 194 | wsServer/libws.a: 195 | git submodule update --init --recursive 196 | cd wsServer && make 197 | 198 | server: server.cpp ggml.o utils.o wsServer/libws.a 199 | $(CXX) $(CXXFLAGS) server.cpp ggml.o utils.o wsServer/libws.a -o server $(LDFLAGS) -liconv 200 | ./server -h 201 | 202 | quantize: quantize.cpp ggml.o utils.o 203 | $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) 204 | 205 | # 206 | # Tests 207 | # 208 | 209 | .PHONY: tests 210 | tests: 211 | bash ./tests/run-tests.sh 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama.cpp 2 | 3 | Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++ 4 | 5 | **Hot topics** 6 | 7 | - Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22 8 | 9 | ## Description 10 | 11 | The main goal is to run the model using 4-bit quantization on a MacBook. 12 | 13 | - Plain C/C++ implementation without dependencies 14 | - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework 15 | - AVX2 support for x86 architectures 16 | - Mixed F16 / F32 precision 17 | - 4-bit quantization support 18 | - Runs on the CPU 19 | 20 | This was hacked in an evening - I have no idea if it works correctly. 21 | Please do not make conclusions about the models based on the results from this implementation. 22 | For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly. 
23 | New features will probably be added mostly through community contributions, if any. 24 | 25 | --- 26 | 27 | Here is a typical run using LLaMA-7B: 28 | 29 | ```java 30 | make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 31 | I llama.cpp build info: 32 | I UNAME_S: Darwin 33 | I UNAME_P: arm 34 | I UNAME_M: arm64 35 | I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE 36 | I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread 37 | I LDFLAGS: -framework Accelerate 38 | I CC: Apple clang version 14.0.0 (clang-1400.0.29.202) 39 | I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202) 40 | 41 | make: Nothing to be done for `default'. 42 | main: seed = 1678486056 43 | llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ... 44 | llama_model_load: n_vocab = 32000 45 | llama_model_load: n_ctx = 512 46 | llama_model_load: n_embd = 4096 47 | llama_model_load: n_mult = 256 48 | llama_model_load: n_head = 32 49 | llama_model_load: n_layer = 32 50 | llama_model_load: n_rot = 128 51 | llama_model_load: f16 = 2 52 | llama_model_load: n_ff = 11008 53 | llama_model_load: ggml ctx size = 4529.34 MB 54 | llama_model_load: memory_size = 512.00 MB, n_mem = 16384 55 | llama_model_load: .................................... done 56 | llama_model_load: model size = 4017.27 MB / num tensors = 291 57 | 58 | main: prompt: 'Building a website can be done in 10 simple steps:' 59 | main: number of tokens in prompt = 15 60 | 1 -> '' 61 | 8893 -> 'Build' 62 | 292 -> 'ing' 63 | 263 -> ' a' 64 | 4700 -> ' website' 65 | 508 -> ' can' 66 | 367 -> ' be' 67 | 2309 -> ' done' 68 | 297 -> ' in' 69 | 29871 -> ' ' 70 | 29896 -> '1' 71 | 29900 -> '0' 72 | 2560 -> ' simple' 73 | 6576 -> ' steps' 74 | 29901 -> ':' 75 | 76 | sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000 77 | 78 | 79 | Building a website can be done in 10 simple steps: 80 | 1) Select a domain name and web hosting plan 81 | 2) Complete a sitemap 82 | 3) List your products 83 | 4) Write product descriptions 84 | 5) Create a user account 85 | 6) Build the template 86 | 7) Start building the website 87 | 8) Advertise the website 88 | 9) Provide email support 89 | 10) Submit the website to search engines 90 | A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. 91 | The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser. 92 | The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer. 93 | A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. 94 | A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen. 95 | A website can also be viewed on different devices such as desktops, tablets and smartphones. 96 | Hence, to have a website displayed on a browser, the website must be hosted. 97 | A domain name is an address of a website. It is the name of the website. 98 | The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. 99 | A website can be displayed on different browsers. 
The browsers are basically the software that renders the website on the user’s screen. 100 | A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted. 101 | A domain name is an address of a website. It is the name of the website. 102 | A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. 103 | The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser. 104 | A website is known as a website when it is hosted 105 | 106 | main: mem per token = 14434244 bytes 107 | main: load time = 1332.48 ms 108 | main: sample time = 1081.40 ms 109 | main: predict time = 31378.77 ms / 61.41 ms per token 110 | main: total time = 34036.74 ms 111 | ``` 112 | 113 | And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: 114 | 115 | https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 116 | 117 | ## Usage 118 | 119 | Here are the step for the LLaMA-7B model: 120 | 121 | ```bash 122 | # build this repo 123 | git clone https://github.com/ggerganov/llama.cpp 124 | cd llama.cpp 125 | make 126 | 127 | # obtain the original LLaMA model weights and place them in ./models 128 | ls ./models 129 | 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model 130 | 131 | # install Python dependencies 132 | python3 -m pip install torch numpy sentencepiece 133 | 134 | # convert the 7B model to ggml FP16 format 135 | python3 convert-pth-to-ggml.py models/7B/ 1 136 | 137 | # quantize the model to 4-bits 138 | ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 139 | 140 | # run the inference 141 | ./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 142 | ``` 143 | 144 | For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format 145 | will create 2 ggml files, instead of one: 146 | 147 | ```bash 148 | ggml-model-f16.bin 149 | ggml-model-f16.bin.1 150 | ``` 151 | 152 | You need to quantize each of them separately like this: 153 | 154 | ```bash 155 | ./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2 156 | ./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 157 | ``` 158 | 159 | Everything else is the same. Simply run: 160 | 161 | ```bash 162 | ./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 163 | ``` 164 | 165 | The number of files generated for each model is as follows: 166 | 167 | ``` 168 | 7B -> 1 file 169 | 13B -> 2 files 170 | 30B -> 4 files 171 | 65B -> 8 files 172 | ``` 173 | 174 | When running the larger models, make sure you have enough disk space to store all the intermediate files. 175 | 176 | ## Limitations 177 | 178 | - Not sure if my tokenizer is correct. There are a few places where we might have a mistake: 179 | - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87 180 | - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69 181 | In general, it seems to work, but I think it fails for unicode character support. 
Hopefully, someone can help with that 182 | - I don't know yet how much the quantization affects the quality of the generated text 183 | - Probably the token sampling can be improved 184 | - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder, 185 | there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't 186 | know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the 187 | performance will be the same, since no BLAS calls are invoked by the current implementation 188 | 189 | -------------------------------------------------------------------------------- /quantize.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | 3 | #include "utils.h" 4 | 5 | #include <cassert> 6 | #include <cmath> 7 | #include <cstdio> 8 | #include <cstring> 9 | #include <fstream> 10 | #include <map> 11 | #include <string> 12 | #include <vector> 13 | #include <regex> 14 | 15 | // TODO: move somewhere else 16 | #define QK 32 17 | 18 | // default hparams (LLaMA 7B) 19 | struct llama_hparams { 20 | int32_t n_vocab = 32000; 21 | int32_t n_ctx = 512; // this is provided as user input? 22 | int32_t n_embd = 4096; 23 | int32_t n_mult = 256; 24 | int32_t n_head = 32; 25 | int32_t n_layer = 32; 26 | int32_t n_rot = 64; 27 | int32_t f16 = 1; 28 | }; 29 | 30 | 31 | // quantize a model 32 | bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) { 33 | ggml_type type = GGML_TYPE_Q4_1; 34 | 35 | switch (itype) { 36 | case 2: type = GGML_TYPE_Q4_0; break; 37 | case 3: type = GGML_TYPE_Q4_1; break; 38 | default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false; 39 | }; 40 | 41 | if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { 42 | fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); 43 | return false; 44 | } 45 | 46 | gpt_vocab vocab; 47 | 48 | printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); 49 | 50 | auto finp = std::ifstream(fname_inp, std::ios::binary); 51 | if (!finp) { 52 | fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); 53 | return false; 54 | } 55 | 56 | auto fout = std::ofstream(fname_out, std::ios::binary); 57 | if (!fout) { 58 | fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); 59 | return false; 60 | } 61 | 62 | // verify magic 63 | { 64 | uint32_t magic; 65 | finp.read((char *) &magic, sizeof(magic)); 66 | if (magic != 0x67676d6c) { 67 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); 68 | return false; 69 | } 70 | 71 | fout.write((char *) &magic, sizeof(magic)); 72 | } 73 | 74 | llama_hparams hparams; 75 | 76 | // load hparams 77 | { 78 | finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 79 | //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 80 | finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 81 | finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 82 | finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 83 | finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 84 | finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 85 | finp.read((char *) &hparams.f16, sizeof(hparams.f16)); 86 | 87 | printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); 88 | printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); 89 | printf("%s: n_embd = %d\n", __func__,
hparams.n_embd); 90 | printf("%s: n_mult = %d\n", __func__, hparams.n_mult); 91 | printf("%s: n_head = %d\n", __func__, hparams.n_head); 92 | printf("%s: n_layer = %d\n", __func__, hparams.n_layer); 93 | printf("%s: f16 = %d\n", __func__, hparams.f16); 94 | 95 | fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 96 | //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 97 | fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 98 | fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 99 | fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); 100 | fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 101 | fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 102 | fout.write((char *) &itype, sizeof(hparams.f16)); 103 | } 104 | 105 | // load vocab 106 | { 107 | const int32_t n_vocab = hparams.n_vocab; 108 | 109 | if (n_vocab != hparams.n_vocab) { 110 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 111 | __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); 112 | return false; 113 | } 114 | 115 | std::string word; 116 | for (int i = 0; i < n_vocab; i++) { 117 | uint32_t len; 118 | finp.read ((char *) &len, sizeof(len)); 119 | fout.write((char *) &len, sizeof(len)); 120 | 121 | word.resize(len); 122 | finp.read ((char *) word.data(), len); 123 | fout.write((char *) word.data(), len); 124 | 125 | vocab.token_to_id[word] = i; 126 | vocab.id_to_token[i] = word; 127 | } 128 | } 129 | 130 | // load weights 131 | { 132 | size_t total_size_org = 0; 133 | size_t total_size_new = 0; 134 | 135 | std::vector work; 136 | 137 | std::vector data_u8; 138 | std::vector data_f16; 139 | std::vector data_f32; 140 | 141 | std::vector hist_all(1 << 4, 0); 142 | 143 | while (true) { 144 | int32_t n_dims; 145 | int32_t length; 146 | int32_t ftype; 147 | 148 | finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 149 | finp.read(reinterpret_cast(&length), sizeof(length)); 150 | finp.read(reinterpret_cast(&ftype), sizeof(ftype)); 151 | 152 | if (finp.eof()) { 153 | break; 154 | } 155 | 156 | int32_t nelements = 1; 157 | int32_t ne[2] = { 1, 1 }; 158 | for (int i = 0; i < n_dims; ++i) { 159 | finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); 160 | nelements *= ne[i]; 161 | } 162 | 163 | std::string name(length, 0); 164 | finp.read (&name[0], length); 165 | 166 | { 167 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 168 | printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); 169 | } 170 | 171 | // regexes of tensor names to be quantized 172 | const std::vector k_names = { 173 | ".*weight", 174 | }; 175 | 176 | bool quantize = false; 177 | for (const auto & s : k_names) { 178 | if (std::regex_match(name, std::regex(s))) { 179 | quantize = true; 180 | break; 181 | } 182 | } 183 | 184 | // quantize only 2D tensors 185 | quantize &= (n_dims == 2); 186 | 187 | if (quantize) { 188 | if (ftype != 0 && ftype != 1) { 189 | fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); 190 | return false; 191 | } 192 | 193 | if (ftype == 1) { 194 | data_f16.resize(nelements); 195 | finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); 196 | data_f32.resize(nelements); 197 | for (int i = 0; i < nelements; ++i) { 198 | data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); 199 | } 200 | } else { 201 | data_f32.resize(nelements); 202 | finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); 203 | } 204 | 205 | ftype = 
itype; 206 | } else { 207 | const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); 208 | 209 | data_u8.resize(nelements*bpe); 210 | finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); 211 | } 212 | 213 | fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); 214 | fout.write(reinterpret_cast(&length), sizeof(length)); 215 | fout.write(reinterpret_cast(&ftype), sizeof(ftype)); 216 | for (int i = 0; i < n_dims; ++i) { 217 | fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); 218 | } 219 | fout.write(&name[0], length); 220 | 221 | if (quantize) { 222 | printf("quantizing .. "); 223 | work.resize(nelements); // for quantization 224 | 225 | size_t cur_size = 0; 226 | std::vector hist_cur(1 << 4, 0); 227 | 228 | switch (type) { 229 | case GGML_TYPE_Q4_0: 230 | { 231 | cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 232 | } break; 233 | case GGML_TYPE_Q4_1: 234 | { 235 | cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 236 | } break; 237 | default: 238 | { 239 | fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); 240 | return false; 241 | } 242 | } 243 | 244 | fout.write(reinterpret_cast(work.data()), cur_size); 245 | total_size_new += cur_size; 246 | 247 | printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); 248 | for (int i = 0; i < hist_cur.size(); ++i) { 249 | hist_all[i] += hist_cur[i]; 250 | } 251 | 252 | for (int i = 0; i < hist_cur.size(); ++i) { 253 | printf("%5.3f ", hist_cur[i] / (float)nelements); 254 | } 255 | printf("\n"); 256 | } else { 257 | printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); 258 | fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); 259 | total_size_new += data_u8.size(); 260 | } 261 | 262 | total_size_org += nelements * sizeof(float); 263 | } 264 | 265 | printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); 266 | printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); 267 | 268 | { 269 | int64_t sum_all = 0; 270 | for (int i = 0; i < hist_all.size(); ++i) { 271 | sum_all += hist_all[i]; 272 | } 273 | 274 | printf("%s: hist: ", __func__); 275 | for (int i = 0; i < hist_all.size(); ++i) { 276 | printf("%5.3f ", hist_all[i] / (float)sum_all); 277 | } 278 | printf("\n"); 279 | } 280 | } 281 | 282 | finp.close(); 283 | fout.close(); 284 | 285 | return true; 286 | } 287 | 288 | // usage: 289 | // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type 290 | // 291 | int main(int argc, char ** argv) { 292 | if (argc != 4) { 293 | fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); 294 | fprintf(stderr, " type = 2 - q4_0\n"); 295 | fprintf(stderr, " type = 3 - q4_1\n"); 296 | return 1; 297 | } 298 | 299 | // needed to initialize f16 tables 300 | { 301 | struct ggml_init_params params = { 0, NULL }; 302 | struct ggml_context * ctx = ggml_init(params); 303 | ggml_free(ctx); 304 | } 305 | 306 | const std::string fname_inp = argv[1]; 307 | const std::string fname_out = argv[2]; 308 | 309 | const int itype = atoi(argv[3]); 310 | 311 | const int64_t t_main_start_us = ggml_time_us(); 312 | 313 | int64_t t_quantize_us = 0; 314 | 315 | // load the model 316 | { 317 | const int64_t t_start_us = ggml_time_us(); 318 | 319 | if (!llama_model_quantize(fname_inp, fname_out, itype)) { 320 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, 
fname_inp.c_str()); 321 | return 1; 322 | } 323 | 324 | t_quantize_us = ggml_time_us() - t_start_us; 325 | } 326 | 327 | // report timing 328 | { 329 | const int64_t t_main_end_us = ggml_time_us(); 330 | 331 | printf("\n"); 332 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); 333 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 334 | } 335 | 336 | return 0; 337 | } 338 | -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { 9 | for (int i = 1; i < argc; i++) { 10 | std::string arg = argv[i]; 11 | 12 | if (arg == "-s" || arg == "--seed") { 13 | params.seed = std::stoi(argv[++i]); 14 | } else if (arg == "-t" || arg == "--threads") { 15 | params.n_threads = std::stoi(argv[++i]); 16 | } else if (arg == "-p" || arg == "--prompt") { 17 | params.prompt = argv[++i]; 18 | } else if (arg == "-n" || arg == "--n_predict") { 19 | params.n_predict = std::stoi(argv[++i]); 20 | } else if (arg == "--top_k") { 21 | params.top_k = std::stoi(argv[++i]); 22 | } else if (arg == "--top_p") { 23 | params.top_p = std::stof(argv[++i]); 24 | } else if (arg == "--temp") { 25 | params.temp = std::stof(argv[++i]); 26 | } else if (arg == "-b" || arg == "--batch_size") { 27 | params.n_batch = std::stoi(argv[++i]); 28 | } else if (arg == "-m" || arg == "--model") { 29 | params.model = argv[++i]; 30 | } else if (arg == "-h" || arg == "--help") { 31 | gpt_print_usage(argc, argv, params); 32 | exit(0); 33 | } else { 34 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 35 | gpt_print_usage(argc, argv, params); 36 | exit(0); 37 | } 38 | } 39 | 40 | return true; 41 | } 42 | 43 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { 44 | fprintf(stderr, "usage: %s [options]\n", argv[0]); 45 | fprintf(stderr, "\n"); 46 | fprintf(stderr, "options:\n"); 47 | fprintf(stderr, " -h, --help show this help message and exit\n"); 48 | fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); 49 | fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); 50 | fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); 51 | fprintf(stderr, " prompt to start generation with (default: random)\n"); 52 | fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); 53 | fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); 54 | fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); 55 | fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); 56 | fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); 57 | fprintf(stderr, " -m FNAME, --model FNAME\n"); 58 | fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); 59 | fprintf(stderr, "\n"); 60 | } 61 | 62 | std::string gpt_random_prompt(std::mt19937 & rng) { 63 | const int r = rng() % 10; 64 | switch (r) { 65 | case 0: return "So"; 66 | case 1: return "Once upon a time"; 67 | case 2: return "When"; 68 | case 3: return "The"; 69 | case 4: return "After"; 70 | case 5: return "If"; 71 | case 6: return "import"; 72 | case 7: return "He"; 73 | case 8: return "She"; 74 | case 9: return "They"; 75 | 
default: return "To"; 76 | } 77 | 78 | return "The"; 79 | } 80 | 81 | void replace(std::string & str, const std::string & needle, const std::string & replacement) { 82 | size_t pos = 0; 83 | while ((pos = str.find(needle, pos)) != std::string::npos) { 84 | str.replace(pos, needle.length(), replacement); 85 | pos += replacement.length(); 86 | } 87 | } 88 | 89 | std::map json_parse(const std::string & fname) { 90 | std::map result; 91 | 92 | // read file into string 93 | std::string json; 94 | { 95 | std::ifstream ifs(fname); 96 | if (!ifs) { 97 | fprintf(stderr, "Failed to open %s\n", fname.c_str()); 98 | exit(1); 99 | } 100 | 101 | json = std::string((std::istreambuf_iterator(ifs)), 102 | (std::istreambuf_iterator())); 103 | } 104 | 105 | if (json[0] != '{') { 106 | return result; 107 | } 108 | 109 | // parse json 110 | { 111 | bool has_key = false; 112 | bool in_token = false; 113 | 114 | std::string str_key = ""; 115 | std::string str_val = ""; 116 | 117 | int n = json.size(); 118 | for (int i = 1; i < n; ++i) { 119 | if (!in_token) { 120 | if (json[i] == ' ') continue; 121 | if (json[i] == '"') { 122 | in_token = true; 123 | continue; 124 | } 125 | } else { 126 | if (json[i] == '\\' && i+1 < n) { 127 | if (has_key == false) { 128 | str_key += json[i]; 129 | } else { 130 | str_val += json[i]; 131 | } 132 | ++i; 133 | } else if (json[i] == '"') { 134 | if (has_key == false) { 135 | has_key = true; 136 | ++i; 137 | while (json[i] == ' ') ++i; 138 | ++i; // : 139 | while (json[i] == ' ') ++i; 140 | if (json[i] != '\"') { 141 | while (json[i] != ',' && json[i] != '}') { 142 | str_val += json[i++]; 143 | } 144 | has_key = false; 145 | } else { 146 | in_token = true; 147 | continue; 148 | } 149 | } else { 150 | has_key = false; 151 | } 152 | 153 | ::replace(str_key, "\\u0120", " " ); // \u0120 -> space 154 | ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line 155 | ::replace(str_key, "\\\"", "\""); // \\\" -> " 156 | 157 | try { 158 | result[str_key] = std::stoi(str_val); 159 | } catch (...) 
{ 160 | //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); 161 | 162 | } 163 | str_key = ""; 164 | str_val = ""; 165 | in_token = false; 166 | continue; 167 | } 168 | if (has_key == false) { 169 | str_key += json[i]; 170 | } else { 171 | str_val += json[i]; 172 | } 173 | } 174 | } 175 | } 176 | 177 | return result; 178 | } 179 | 180 | std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { 181 | std::vector words; 182 | 183 | // first split the text into words 184 | { 185 | std::string str = text; 186 | std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; 187 | 188 | std::regex re(pat); 189 | std::smatch m; 190 | 191 | while (std::regex_search(str, m, re)) { 192 | for (auto x : m) { 193 | words.push_back(x); 194 | } 195 | str = m.suffix(); 196 | } 197 | } 198 | 199 | // find the longest tokens that form the words: 200 | std::vector tokens; 201 | for (const auto & word : words) { 202 | if (word.size() == 0) continue; 203 | 204 | int i = 0; 205 | int n = word.size(); 206 | while (i < n) { 207 | int j = n; 208 | while (j > i) { 209 | auto it = vocab.token_to_id.find(word.substr(i, j-i)); 210 | if (it != vocab.token_to_id.end()) { 211 | tokens.push_back(it->second); 212 | i = j; 213 | break; 214 | } 215 | --j; 216 | } 217 | if (i == n) { 218 | break; 219 | } 220 | if (j == i) { 221 | auto sub = word.substr(i, 1); 222 | if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { 223 | tokens.push_back(vocab.token_to_id.at(sub)); 224 | } else { 225 | fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); 226 | } 227 | ++i; 228 | } 229 | } 230 | } 231 | 232 | return tokens; 233 | } 234 | 235 | std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { 236 | //auto res = gpt_tokenize(vocab, text); 237 | 238 | //if (bos) { 239 | // res.insert(res.begin(), 1); // TODO: replace with vocab.bos 240 | //} 241 | 242 | std::vector res; 243 | 244 | if (bos) { 245 | res.push_back(1); // TODO: replace with vocab.bos 246 | } 247 | 248 | //find the longest token that matches the text 249 | int pos = 0; 250 | while (true) { 251 | int l = 0; 252 | int t = 0; 253 | for (const auto & kv : vocab.id_to_token) { 254 | if (kv.second.size() < l) continue; 255 | if (kv.second.size() > text.size() - pos) continue; 256 | if (text.substr(pos, kv.second.size()) == kv.second) { 257 | l = kv.second.size(); 258 | t = kv.first; 259 | } 260 | } 261 | 262 | if (l == 0) { 263 | break; 264 | } 265 | 266 | res.push_back(t); 267 | pos += l; 268 | } 269 | 270 | return res; 271 | } 272 | 273 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { 274 | printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); 275 | 276 | vocab.token_to_id = ::json_parse(fname); 277 | 278 | for (const auto & kv : vocab.token_to_id) { 279 | vocab.id_to_token[kv.second] = kv.first; 280 | } 281 | 282 | printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); 283 | 284 | // print the vocabulary 285 | //for (auto kv : vocab.token_to_id) { 286 | // printf("'%s' -> %d\n", kv.first.data(), kv.second); 287 | //} 288 | 289 | return true; 290 | } 291 | 292 | gpt_vocab::id gpt_sample_top_k_top_p( 293 | const gpt_vocab & vocab, 294 | const float * logits, 295 | int top_k, 296 | double top_p, 297 | double temp, 298 | std::mt19937 & rng) { 299 | int n_logits = vocab.id_to_token.size(); 300 | 301 | std::vector> logits_id; 302 | 
logits_id.reserve(n_logits); 303 | 304 | { 305 | const double scale = 1.0/temp; 306 | for (int i = 0; i < n_logits; ++i) { 307 | logits_id.push_back(std::make_pair(logits[i]*scale, i)); 308 | } 309 | } 310 | 311 | // find the top K tokens 312 | std::partial_sort( 313 | logits_id.begin(), 314 | logits_id.begin() + top_k, logits_id.end(), 315 | [](const std::pair & a, const std::pair & b) { 316 | return a.first > b.first; 317 | }); 318 | 319 | logits_id.resize(top_k); 320 | 321 | double maxl = -INFINITY; 322 | for (const auto & kv : logits_id) { 323 | maxl = std::max(maxl, kv.first); 324 | } 325 | 326 | // compute probs for the top K tokens 327 | std::vector probs; 328 | probs.reserve(logits_id.size()); 329 | 330 | double sum = 0.0; 331 | for (const auto & kv : logits_id) { 332 | double p = exp(kv.first - maxl); 333 | probs.push_back(p); 334 | sum += p; 335 | } 336 | 337 | // normalize the probs 338 | for (auto & p : probs) { 339 | p /= sum; 340 | } 341 | 342 | if (top_p < 1.0f) { 343 | double cumsum = 0.0f; 344 | for (int i = 0; i < top_k; i++) { 345 | cumsum += probs[i]; 346 | if (cumsum >= top_p) { 347 | top_k = i + 1; 348 | probs.resize(top_k); 349 | logits_id.resize(top_k); 350 | break; 351 | } 352 | } 353 | 354 | cumsum = 1.0/cumsum; 355 | for (int i = 0; i < (int) probs.size(); i++) { 356 | probs[i] *= cumsum; 357 | } 358 | } 359 | 360 | //printf("\n"); 361 | //for (int i = 0; i < (int) probs.size(); i++) { 362 | // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); 363 | //} 364 | //exit(0); 365 | 366 | std::discrete_distribution<> dist(probs.begin(), probs.end()); 367 | int idx = dist(rng); 368 | 369 | return logits_id[idx].second; 370 | } 371 | 372 | gpt_vocab::id llama_sample_top_p( 373 | const gpt_vocab & vocab, 374 | const float * logits, 375 | double top_p, 376 | double temp, 377 | std::mt19937 & rng) { 378 | int n_logits = vocab.id_to_token.size(); 379 | 380 | std::vector> logits_id; 381 | logits_id.reserve(n_logits); 382 | 383 | { 384 | const double scale = 1.0/temp; 385 | for (int i = 0; i < n_logits; ++i) { 386 | logits_id.push_back(std::make_pair(logits[i]*scale, i)); 387 | } 388 | } 389 | 390 | std::sort( 391 | logits_id.begin(), 392 | logits_id.end(), 393 | [](const std::pair & a, const std::pair & b) { 394 | return a.first > b.first; 395 | }); 396 | 397 | double maxl = -INFINITY; 398 | for (const auto & kv : logits_id) { 399 | maxl = std::max(maxl, kv.first); 400 | } 401 | 402 | // compute probs for the top K tokens 403 | std::vector probs; 404 | probs.reserve(logits_id.size()); 405 | 406 | double sum = 0.0; 407 | for (const auto & kv : logits_id) { 408 | double p = exp(kv.first - maxl); 409 | probs.push_back(p); 410 | sum += p; 411 | } 412 | 413 | // normalize the probs 414 | for (auto & p : probs) { 415 | p /= sum; 416 | } 417 | 418 | if (top_p < 1.0f) { 419 | double cumsum = 0.0f; 420 | for (int i = 0; i < (int) probs.size(); i++) { 421 | cumsum += probs[i]; 422 | if (cumsum >= top_p) { 423 | probs.resize(i + 1); 424 | logits_id.resize(i + 1); 425 | break; 426 | } 427 | } 428 | 429 | cumsum = 1.0/cumsum; 430 | for (int i = 0; i < (int) probs.size(); i++) { 431 | probs[i] *= cumsum; 432 | } 433 | } 434 | 435 | //printf("\n"); 436 | //for (int i = 0; i < (int) 10; i++) { 437 | // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); 438 | //} 439 | //printf("\n\n"); 440 | //exit(0); 441 | 442 | std::discrete_distribution<> dist(probs.begin(), probs.end()); 443 | int idx = dist(rng); 444 | 445 | 
return logits_id[idx].second; 446 | } 447 | 448 | 449 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { 450 | const int nb = k / qk; 451 | const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); 452 | const size_t row_size = nb*bs; 453 | 454 | assert(k % qk == 0); 455 | 456 | uint8_t pp[qk/2]; 457 | 458 | char * pdst = (char *) dst; 459 | 460 | for (int j = 0; j < n; j += k) { 461 | uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); 462 | uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); 463 | 464 | for (int i = 0; i < nb; i++) { 465 | float amax = 0.0f; // absolute max 466 | 467 | { 468 | for (int l = 0; l < qk; l++) { 469 | const float v = src[j + i*qk + l]; 470 | amax = std::max(amax, fabsf(v)); 471 | } 472 | 473 | const float d = amax / ((1 << 3) - 1); 474 | const float id = d ? 1.0f/d : 0.0f; 475 | 476 | *(float *) pd = d; 477 | pd += bs; 478 | 479 | for (int l = 0; l < qk; l += 2) { 480 | const float v0 = (src[j + i*qk + l + 0])*id; 481 | const float v1 = (src[j + i*qk + l + 1])*id; 482 | 483 | const uint8_t vi0 = ((int8_t) (round(v0))) + 8; 484 | const uint8_t vi1 = ((int8_t) (round(v1))) + 8; 485 | 486 | assert(vi0 >= 0 && vi0 < 16); 487 | assert(vi1 >= 0 && vi1 < 16); 488 | 489 | hist[vi0]++; 490 | hist[vi1]++; 491 | 492 | pp[l/2] = vi0 | (vi1 << 4); 493 | } 494 | 495 | memcpy(pb, pp, sizeof(pp)); 496 | pb += bs; 497 | } 498 | } 499 | } 500 | 501 | return (n/k)*row_size; 502 | } 503 | 504 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { 505 | const int nb = k / qk; 506 | const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); 507 | 508 | assert(k % qk == 0); 509 | 510 | uint8_t pp[qk/2]; 511 | 512 | char * pdst = (char *) dst; 513 | 514 | for (int j = 0; j < n; j += k) { 515 | float * pm = (float *) (pdst + (j/k)*row_size); 516 | float * pd = (float *) (pm + nb); 517 | uint8_t * pb = (uint8_t *) (pd + nb); 518 | 519 | //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); 520 | 521 | for (int i = 0; i < nb; i++) { 522 | float min = std::numeric_limits::max(); 523 | float max = std::numeric_limits::min(); 524 | 525 | { 526 | for (int l = 0; l < qk; l++) { 527 | const float v = src[j + i*qk + l]; 528 | if (v < min) min = v; 529 | if (v > max) max = v; 530 | } 531 | 532 | const float d = (max - min) / ((1 << 4) - 1); 533 | const float id = d ? 1.0f/d : 0.0f; 534 | 535 | pm[i] = min; 536 | pd[i] = d; 537 | 538 | for (int l = 0; l < qk; l += 2) { 539 | const float v0 = (src[j + i*qk + l + 0] - min)*id; 540 | const float v1 = (src[j + i*qk + l + 1] - min)*id; 541 | 542 | const uint8_t vi0 = round(v0); 543 | const uint8_t vi1 = round(v1); 544 | 545 | assert(vi0 >= 0 && vi0 < 16); 546 | assert(vi1 >= 0 && vi1 < 16); 547 | 548 | hist[vi0]++; 549 | hist[vi1]++; 550 | 551 | pp[l/2] = vi0 | (vi1 << 4); 552 | } 553 | 554 | memcpy(pb + i*qk/2, pp, sizeof(pp)); 555 | } 556 | } 557 | } 558 | 559 | return (n/k)*row_size; 560 | } 561 | -------------------------------------------------------------------------------- /ggml.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // 4 | // GGML Tensor Library 5 | // 6 | // This documentation is still a work in progress. 
7 | // If you wish some specific topics to be covered, feel free to drop a comment: 8 | // 9 | // https://github.com/ggerganov/whisper.cpp/issues/40 10 | // 11 | // ## Overview 12 | // 13 | // This library implements: 14 | // 15 | // - a set of tensor operations 16 | // - automatic differentiation 17 | // - basic optimization algorithms 18 | // 19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, 20 | // but is not limited to, the following: 21 | // 22 | // - linear regression 23 | // - support vector machines 24 | // - neural networks 25 | // 26 | // The library allows the user to define a certain function using the available tensor operations. This function 27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition 28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the 29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized 30 | // using one of the available optimization algorithms. 31 | // 32 | // For example, here we define the function: f(x) = a*x^2 + b 33 | // 34 | // { 35 | // struct ggml_init_params params = { 36 | // .mem_size = 16*1024*1024, 37 | // .mem_buffer = NULL, 38 | // }; 39 | // 40 | // // memory allocation happens here 41 | // struct ggml_context * ctx = ggml_init(params); 42 | // 43 | // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 44 | // 45 | // ggml_set_param(ctx, x); // x is an input variable 46 | // 47 | // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 48 | // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 49 | // struct ggml_tensor * x2 = ggml_mul(ctx, x, x); 50 | // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); 51 | // 52 | // ... 53 | // } 54 | // 55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only 56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: 57 | // 58 | // { 59 | // ... 60 | // 61 | // struct ggml_cgraph gf = ggml_build_forward(f); 62 | // 63 | // // set the input variable and parameter values 64 | // ggml_set_f32(x, 2.0f); 65 | // ggml_set_f32(a, 3.0f); 66 | // ggml_set_f32(b, 4.0f); 67 | // 68 | // ggml_graph_compute(ctx0, &gf); 69 | // 70 | // printf("f = %f\n", ggml_get_f32_1d(f, 0)); 71 | // 72 | // ... 73 | // } 74 | // 75 | // The actual computation is performed in the ggml_graph_compute() function. 76 | // 77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the 78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know 79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory 80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was 81 | // actually needed. 82 | // 83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic 84 | // differentiation and optimization algorithms. 85 | // 86 | // The described approach allows to define the function graph once and then compute its forward or backward graphs 87 | // multiple times. 
All computations will use the same memory buffer allocated in the ggml_init() function. This way 88 | // the user can avoid the memory allocation overhead at runtime. 89 | // 90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class 91 | // citizens, but in theory the library can be extended to support FP8 and integer data types. 92 | // 93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary 94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became 95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear 96 | // yet, but a few examples are demonstrated in the following operations: 97 | // 98 | // - ggml_permute() 99 | // - ggml_conv_1d_1s() 100 | // - ggml_conv_1d_2s() 101 | // 102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function 103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the 104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a 105 | // calculus class, or watch the following video: 106 | // 107 | // What is Automatic Differentiation? 108 | // https://www.youtube.com/watch?v=wG_nF1awSSY 109 | // 110 | // 111 | // ## Tensor data (struct ggml_tensor) 112 | // 113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of 114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains 115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: 116 | // 117 | // { 118 | // struct ggml_tensor * c = ggml_add(ctx, a, b); 119 | // 120 | // assert(c->src[0] == a); 121 | // assert(c->src[1] == b); 122 | // } 123 | // 124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the 125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows 126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and 127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is 128 | // contiguous in memory. 129 | // 130 | // The data of the tensor is accessed via the "data" pointer. For example: 131 | // 132 | // { 133 | // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); 134 | // 135 | // // a[1, 2] = 1.0f; 136 | // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; 137 | // 138 | // // a[2, 0] = 2.0f; 139 | // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; 140 | // 141 | // ... 142 | // } 143 | // 144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
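//
// As a small sketch (reusing the 2x3 tensor from the example above), the same
// element accesses can go through the 1D helpers, where the index is the flat
// row-major offset into the data:
//
//   {
//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
//       // a[1, 2] = 1.0f;  (flat index = 2*ne[0] + 1 = 5)
//       ggml_set_f32_1d(a, 5, 1.0f);
//
//       const float v = ggml_get_f32_1d(a, 5); // read the value back
//   }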
145 | // 146 | // ## The matrix multiplication operator (ggml_mul_mat) 147 | // 148 | // TODO 149 | // 150 | // 151 | // ## Multi-threading 152 | // 153 | // TODO 154 | // 155 | // 156 | // ## Overview of ggml.c 157 | // 158 | // TODO 159 | // 160 | // 161 | // ## SIMD optimizations 162 | // 163 | // TODO 164 | // 165 | // 166 | // ## Debugging ggml 167 | // 168 | // TODO 169 | // 170 | // 171 | 172 | #ifdef __cplusplus 173 | extern "C" { 174 | #endif 175 | 176 | #include 177 | #include 178 | #include 179 | 180 | #define GGML_MAX_DIMS 4 181 | #define GGML_MAX_NODES 4096 182 | #define GGML_MAX_PARAMS 16 183 | #define GGML_MAX_CONTEXTS 64 184 | #define GGML_MAX_OPT 4 185 | 186 | #ifdef __ARM_NEON 187 | // we use the built-in 16-bit float type 188 | typedef __fp16 ggml_fp16_t; 189 | #else 190 | typedef uint16_t ggml_fp16_t; 191 | #endif 192 | 193 | // convert FP16 <-> FP32 194 | float ggml_fp16_to_fp32(ggml_fp16_t x); 195 | ggml_fp16_t ggml_fp32_to_fp16(float x); 196 | 197 | struct ggml_object; 198 | struct ggml_context; 199 | 200 | enum ggml_type { 201 | GGML_TYPE_Q4_0, 202 | GGML_TYPE_Q4_1, 203 | GGML_TYPE_I8, 204 | GGML_TYPE_I16, 205 | GGML_TYPE_I32, 206 | GGML_TYPE_F16, 207 | GGML_TYPE_F32, 208 | GGML_TYPE_COUNT, 209 | }; 210 | 211 | // available tensor operations: 212 | enum ggml_op { 213 | GGML_OP_NONE = 0, 214 | 215 | GGML_OP_DUP, 216 | GGML_OP_ADD, 217 | GGML_OP_SUB, 218 | GGML_OP_MUL, 219 | GGML_OP_DIV, 220 | GGML_OP_SQR, 221 | GGML_OP_SQRT, 222 | GGML_OP_SUM, 223 | GGML_OP_MEAN, 224 | GGML_OP_REPEAT, 225 | GGML_OP_ABS, 226 | GGML_OP_SGN, 227 | GGML_OP_NEG, 228 | GGML_OP_STEP, 229 | GGML_OP_RELU, 230 | GGML_OP_GELU, 231 | GGML_OP_SILU, 232 | GGML_OP_NORM, // normalize 233 | 234 | GGML_OP_MUL_MAT, 235 | 236 | GGML_OP_SCALE, 237 | GGML_OP_CPY, 238 | GGML_OP_RESHAPE, 239 | GGML_OP_VIEW, 240 | GGML_OP_PERMUTE, 241 | GGML_OP_TRANSPOSE, 242 | GGML_OP_GET_ROWS, 243 | GGML_OP_DIAG_MASK_INF, 244 | GGML_OP_SOFT_MAX, 245 | GGML_OP_ROPE, 246 | GGML_OP_CONV_1D_1S, 247 | GGML_OP_CONV_1D_2S, 248 | 249 | GGML_OP_FLASH_ATTN, 250 | GGML_OP_FLASH_FF, 251 | 252 | GGML_OP_COUNT, 253 | }; 254 | 255 | // n-dimensional tensor 256 | struct ggml_tensor { 257 | enum ggml_type type; 258 | 259 | int n_dims; 260 | int ne[GGML_MAX_DIMS]; // number of elements 261 | size_t nb[GGML_MAX_DIMS]; // stride in bytes: 262 | // nb[0] = sizeof(type) 263 | // nb[1] = nb[0] * ne[0] + padding 264 | // nb[i] = nb[i-1] * ne[i-1] 265 | 266 | // compute data 267 | enum ggml_op op; 268 | 269 | bool is_param; 270 | 271 | struct ggml_tensor * grad; 272 | struct ggml_tensor * src0; 273 | struct ggml_tensor * src1; 274 | struct ggml_tensor * opt[GGML_MAX_OPT]; 275 | 276 | // thread scheduling 277 | int n_tasks; 278 | 279 | // performance 280 | int perf_runs; 281 | int64_t perf_cycles; 282 | int64_t perf_time_us; 283 | 284 | void * data; 285 | char padding[8]; 286 | }; 287 | 288 | // computation graph 289 | struct ggml_cgraph { 290 | int n_nodes; 291 | int n_leafs; 292 | int n_threads; 293 | 294 | size_t work_size; 295 | struct ggml_tensor * work; 296 | 297 | struct ggml_tensor * nodes[GGML_MAX_NODES]; 298 | struct ggml_tensor * grads[GGML_MAX_NODES]; 299 | struct ggml_tensor * leafs[GGML_MAX_NODES]; 300 | 301 | // performance 302 | int perf_runs; 303 | int64_t perf_cycles; 304 | int64_t perf_time_us; 305 | }; 306 | 307 | // scratch buffer 308 | struct ggml_scratch { 309 | size_t offs; 310 | size_t size; 311 | void * data; 312 | }; 313 | 314 | struct ggml_init_params { 315 | // memory pool 316 | size_t mem_size; // bytes 317 | void * 
mem_buffer; // if NULL, memory will be allocated internally 318 | }; 319 | 320 | void ggml_time_init(void); // call this once at the beginning of the program 321 | int64_t ggml_time_ms(void); 322 | int64_t ggml_time_us(void); 323 | int64_t ggml_cycles(void); 324 | int64_t ggml_cycles_per_ms(void); 325 | 326 | void ggml_print_object (const struct ggml_object * obj); 327 | void ggml_print_objects(const struct ggml_context * ctx); 328 | 329 | int ggml_nelements(const struct ggml_tensor * tensor); 330 | size_t ggml_nbytes (const struct ggml_tensor * tensor); 331 | 332 | int ggml_blck_size (enum ggml_type type); 333 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block 334 | float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float 335 | 336 | size_t ggml_element_size(const struct ggml_tensor * tensor); 337 | 338 | struct ggml_context * ggml_init(struct ggml_init_params params); 339 | void ggml_free(struct ggml_context * ctx); 340 | 341 | size_t ggml_used_mem(const struct ggml_context * ctx); 342 | 343 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); 344 | 345 | struct ggml_tensor * ggml_new_tensor( 346 | struct ggml_context * ctx, 347 | enum ggml_type type, 348 | int n_dims, 349 | const int *ne); 350 | 351 | struct ggml_tensor * ggml_new_tensor_1d( 352 | struct ggml_context * ctx, 353 | enum ggml_type type, 354 | int ne0); 355 | 356 | struct ggml_tensor * ggml_new_tensor_2d( 357 | struct ggml_context * ctx, 358 | enum ggml_type type, 359 | int ne0, 360 | int ne1); 361 | 362 | struct ggml_tensor * ggml_new_tensor_3d( 363 | struct ggml_context * ctx, 364 | enum ggml_type type, 365 | int ne0, 366 | int ne1, 367 | int ne2); 368 | 369 | struct ggml_tensor * ggml_new_tensor_4d( 370 | struct ggml_context * ctx, 371 | enum ggml_type type, 372 | int ne0, 373 | int ne1, 374 | int ne2, 375 | int ne3); 376 | 377 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); 378 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); 379 | 380 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); 381 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); 382 | 383 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); 384 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); 385 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); 386 | 387 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); 388 | void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); 389 | 390 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); 391 | void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); 392 | 393 | void * ggml_get_data (const struct ggml_tensor * tensor); 394 | float * ggml_get_data_f32(const struct ggml_tensor * tensor); 395 | 396 | // 397 | // operations on tensors with backpropagation 398 | // 399 | 400 | struct ggml_tensor * ggml_dup( 401 | struct ggml_context * ctx, 402 | struct ggml_tensor * a); 403 | 404 | struct ggml_tensor * ggml_add( 405 | struct ggml_context * ctx, 406 | struct ggml_tensor * a, 407 | struct ggml_tensor * b); 408 | 409 | struct ggml_tensor * ggml_sub( 410 | struct ggml_context * ctx, 411 | struct ggml_tensor * a, 412 | struct ggml_tensor * b); 413 | 414 | struct ggml_tensor * ggml_mul( 415 | struct ggml_context * ctx, 416 | 
struct ggml_tensor * a, 417 | struct ggml_tensor * b); 418 | 419 | struct ggml_tensor * ggml_div( 420 | struct ggml_context * ctx, 421 | struct ggml_tensor * a, 422 | struct ggml_tensor * b); 423 | 424 | struct ggml_tensor * ggml_sqr( 425 | struct ggml_context * ctx, 426 | struct ggml_tensor * a); 427 | 428 | struct ggml_tensor * ggml_sqrt( 429 | struct ggml_context * ctx, 430 | struct ggml_tensor * a); 431 | 432 | // return scalar 433 | // TODO: compute sum along rows 434 | struct ggml_tensor * ggml_sum( 435 | struct ggml_context * ctx, 436 | struct ggml_tensor * a); 437 | 438 | // mean along rows 439 | struct ggml_tensor * ggml_mean( 440 | struct ggml_context * ctx, 441 | struct ggml_tensor * a); 442 | 443 | // if a is the same shape as b, and a is not parameter, return a 444 | // otherwise, return a new tensor: repeat(a) to fit in b 445 | struct ggml_tensor * ggml_repeat( 446 | struct ggml_context * ctx, 447 | struct ggml_tensor * a, 448 | struct ggml_tensor * b); 449 | 450 | struct ggml_tensor * ggml_abs( 451 | struct ggml_context * ctx, 452 | struct ggml_tensor * a); 453 | 454 | struct ggml_tensor * ggml_sgn( 455 | struct ggml_context * ctx, 456 | struct ggml_tensor * a); 457 | 458 | struct ggml_tensor * ggml_neg( 459 | struct ggml_context * ctx, 460 | struct ggml_tensor * a); 461 | 462 | struct ggml_tensor * ggml_step( 463 | struct ggml_context * ctx, 464 | struct ggml_tensor * a); 465 | 466 | struct ggml_tensor * ggml_relu( 467 | struct ggml_context * ctx, 468 | struct ggml_tensor * a); 469 | 470 | // TODO: double-check this computation is correct 471 | struct ggml_tensor * ggml_gelu( 472 | struct ggml_context * ctx, 473 | struct ggml_tensor * a); 474 | 475 | struct ggml_tensor * ggml_silu( 476 | struct ggml_context * ctx, 477 | struct ggml_tensor * a); 478 | 479 | // normalize along rows 480 | // TODO: eps is hardcoded to 1e-5 for now 481 | struct ggml_tensor * ggml_norm( 482 | struct ggml_context * ctx, 483 | struct ggml_tensor * a); 484 | 485 | // A: m rows, n columns 486 | // B: p rows, n columns (i.e. 
we transpose it internally) 487 | // result is m columns, p rows 488 | struct ggml_tensor * ggml_mul_mat( 489 | struct ggml_context * ctx, 490 | struct ggml_tensor * a, 491 | struct ggml_tensor * b); 492 | 493 | // 494 | // operations on tensors without backpropagation 495 | // 496 | 497 | // in-place, returns view(a) 498 | struct ggml_tensor * ggml_scale( 499 | struct ggml_context * ctx, 500 | struct ggml_tensor * a, 501 | struct ggml_tensor * b); 502 | 503 | // a -> b, return view(b) 504 | struct ggml_tensor * ggml_cpy( 505 | struct ggml_context * ctx, 506 | struct ggml_tensor * a, 507 | struct ggml_tensor * b); 508 | 509 | // return view(a), b specifies the new shape 510 | // TODO: when we start computing gradient, make a copy instead of view 511 | struct ggml_tensor * ggml_reshape( 512 | struct ggml_context * ctx, 513 | struct ggml_tensor * a, 514 | struct ggml_tensor * b); 515 | 516 | // return view(a) 517 | // TODO: when we start computing gradient, make a copy instead of view 518 | struct ggml_tensor * ggml_reshape_2d( 519 | struct ggml_context * ctx, 520 | struct ggml_tensor * a, 521 | int ne0, 522 | int ne1); 523 | 524 | // return view(a) 525 | // TODO: when we start computing gradient, make a copy instead of view 526 | struct ggml_tensor * ggml_reshape_3d( 527 | struct ggml_context * ctx, 528 | struct ggml_tensor * a, 529 | int ne0, 530 | int ne1, 531 | int ne2); 532 | 533 | // offset in bytes 534 | struct ggml_tensor * ggml_view_1d( 535 | struct ggml_context * ctx, 536 | struct ggml_tensor * a, 537 | int ne0, 538 | size_t offset); 539 | 540 | struct ggml_tensor * ggml_view_2d( 541 | struct ggml_context * ctx, 542 | struct ggml_tensor * a, 543 | int ne0, 544 | int ne1, 545 | size_t nb1, // row stride in bytes 546 | size_t offset); 547 | 548 | struct ggml_tensor * ggml_permute( 549 | struct ggml_context * ctx, 550 | struct ggml_tensor * a, 551 | int axis0, 552 | int axis1, 553 | int axis2, 554 | int axis3); 555 | 556 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3) 557 | struct ggml_tensor * ggml_transpose( 558 | struct ggml_context * ctx, 559 | struct ggml_tensor * a); 560 | 561 | struct ggml_tensor * ggml_get_rows( 562 | struct ggml_context * ctx, 563 | struct ggml_tensor * a, 564 | struct ggml_tensor * b); 565 | 566 | // set elements above the diagonal to -INF 567 | // in-place, returns view(a) 568 | struct ggml_tensor * ggml_diag_mask_inf( 569 | struct ggml_context * ctx, 570 | struct ggml_tensor * a, 571 | int n_past); 572 | 573 | // in-place, returns view(a) 574 | struct ggml_tensor * ggml_soft_max( 575 | struct ggml_context * ctx, 576 | struct ggml_tensor * a); 577 | 578 | // rotary position embedding 579 | // in-place, returns view(a) 580 | // if mode == 1, skip n_past elements 581 | // TODO: avoid creating a new tensor every time 582 | struct ggml_tensor * ggml_rope( 583 | struct ggml_context * ctx, 584 | struct ggml_tensor * a, 585 | int n_past, 586 | int n_dims, 587 | int mode); 588 | 589 | // padding = 1 590 | // TODO: we don't support extra parameters for now 591 | // that's why we are hard-coding the stride, padding, and dilation 592 | // not great .. 
593 | struct ggml_tensor * ggml_conv_1d_1s( 594 | struct ggml_context * ctx, 595 | struct ggml_tensor * a, 596 | struct ggml_tensor * b); 597 | 598 | struct ggml_tensor * ggml_conv_1d_2s( 599 | struct ggml_context * ctx, 600 | struct ggml_tensor * a, 601 | struct ggml_tensor * b); 602 | 603 | struct ggml_tensor * ggml_flash_attn( 604 | struct ggml_context * ctx, 605 | struct ggml_tensor * q, 606 | struct ggml_tensor * k, 607 | struct ggml_tensor * v, 608 | bool masked); 609 | 610 | struct ggml_tensor * ggml_flash_ff( 611 | struct ggml_context * ctx, 612 | struct ggml_tensor * a, 613 | struct ggml_tensor * b0, 614 | struct ggml_tensor * b1, 615 | struct ggml_tensor * c0, 616 | struct ggml_tensor * c1); 617 | 618 | // 619 | // automatic differentiation 620 | // 621 | 622 | void ggml_set_param( 623 | struct ggml_context * ctx, 624 | struct ggml_tensor * tensor); 625 | 626 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); 627 | 628 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); 629 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); 630 | 631 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); 632 | void ggml_graph_reset (struct ggml_cgraph * cgraph); 633 | 634 | // print info and performance information for the graph 635 | void ggml_graph_print(const struct ggml_cgraph * cgraph); 636 | 637 | // dump the graph into a file using the dot format 638 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); 639 | 640 | // 641 | // optimization 642 | // 643 | 644 | // optimization methods 645 | enum ggml_opt_type { 646 | GGML_OPT_ADAM, 647 | GGML_OPT_LBFGS, 648 | }; 649 | 650 | // linesearch methods 651 | enum ggml_linesearch { 652 | GGML_LINESEARCH_DEFAULT = 1, 653 | 654 | GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, 655 | GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, 656 | GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, 657 | }; 658 | 659 | // optimization return values 660 | enum ggml_opt_result { 661 | GGML_OPT_OK = 0, 662 | GGML_OPT_DID_NOT_CONVERGE, 663 | GGML_OPT_NO_CONTEXT, 664 | GGML_OPT_INVALID_WOLFE, 665 | GGML_OPT_FAIL, 666 | 667 | GGML_LINESEARCH_FAIL = -128, 668 | GGML_LINESEARCH_MINIMUM_STEP, 669 | GGML_LINESEARCH_MAXIMUM_STEP, 670 | GGML_LINESEARCH_MAXIMUM_ITERATIONS, 671 | GGML_LINESEARCH_INVALID_PARAMETERS, 672 | }; 673 | 674 | // optimization parameters 675 | // 676 | // see ggml.c (ggml_opt_default_params) for default values 677 | // 678 | struct ggml_opt_params { 679 | enum ggml_opt_type type; 680 | 681 | int n_threads; 682 | 683 | // delta-based convergence test 684 | // 685 | // if past == 0 - disabled 686 | // if past > 0: 687 | // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) 688 | // 689 | int past; 690 | float delta; 691 | 692 | // maximum number of iterations without improvement 693 | // 694 | // if 0 - disabled 695 | // if > 0: 696 | // assume convergence if no cost improvement in this number of iterations 697 | // 698 | int max_no_improvement; 699 | 700 | bool print_forward_graph; 701 | bool print_backward_graph; 702 | 703 | // ADAM parameters 704 | struct { 705 | int n_iter; 706 | 707 | float alpha; // learning rate 708 | float beta1; 709 | float beta2; 710 | float eps; // epsilon for numerical stability 711 | float eps_f; // epsilon for convergence test 712 | float eps_g; // epsilon for convergence test 713 | } adam; 714 | 715 | // LBFGS parameters 716 | struct { 717 | 
int m; // number of corrections to approximate the inv. Hessian 718 | int n_iter; 719 | int max_linesearch; 720 | 721 | float eps; // convergence tolerance 722 | float ftol; // line search tolerance 723 | float wolfe; 724 | float min_step; 725 | float max_step; 726 | 727 | enum ggml_linesearch linesearch; 728 | } lbfgs; 729 | }; 730 | 731 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); 732 | 733 | // optimize the function defined by the tensor f 734 | enum ggml_opt_result ggml_opt( 735 | struct ggml_context * ctx, 736 | struct ggml_opt_params params, 737 | struct ggml_tensor * f); 738 | 739 | // 740 | // system info 741 | // 742 | 743 | int ggml_cpu_has_avx(void); 744 | int ggml_cpu_has_avx2(void); 745 | int ggml_cpu_has_avx512(void); 746 | int ggml_cpu_has_fma(void); 747 | int ggml_cpu_has_neon(void); 748 | int ggml_cpu_has_arm_fma(void); 749 | int ggml_cpu_has_f16c(void); 750 | int ggml_cpu_has_fp16_va(void); 751 | int ggml_cpu_has_wasm_simd(void); 752 | int ggml_cpu_has_blas(void); 753 | int ggml_cpu_has_sse3(void); 754 | int ggml_cpu_has_vsx(void); 755 | 756 | #ifdef __cplusplus 757 | } 758 | #endif 759 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | // determine number of model parts based on the dimension 15 | static const std::map LLAMA_N_PARTS = { 16 | { 4096, 1 }, 17 | { 5120, 2 }, 18 | { 6656, 4 }, 19 | { 8192, 8 }, 20 | }; 21 | 22 | // default hparams (LLaMA 7B) 23 | struct llama_hparams { 24 | int32_t n_vocab = 32000; 25 | int32_t n_ctx = 512; // this is provided as user input? 
26 | int32_t n_embd = 4096; 27 | int32_t n_mult = 256; 28 | int32_t n_head = 32; 29 | int32_t n_layer = 32; 30 | int32_t n_rot = 64; 31 | int32_t f16 = 1; 32 | }; 33 | 34 | struct llama_layer { 35 | // normalization 36 | struct ggml_tensor * attention_norm; 37 | 38 | // attention 39 | struct ggml_tensor * wq; 40 | struct ggml_tensor * wk; 41 | struct ggml_tensor * wv; 42 | struct ggml_tensor * wo; 43 | 44 | // normalization 45 | struct ggml_tensor * ffn_norm; 46 | 47 | // ff 48 | struct ggml_tensor * w1; 49 | struct ggml_tensor * w2; 50 | struct ggml_tensor * w3; 51 | }; 52 | 53 | struct llama_model { 54 | llama_hparams hparams; 55 | 56 | struct ggml_tensor * tok_embeddings; 57 | 58 | struct ggml_tensor * norm; 59 | struct ggml_tensor * output; 60 | 61 | std::vector layers; 62 | 63 | // key + value memory 64 | struct ggml_tensor * memory_k; 65 | struct ggml_tensor * memory_v; 66 | 67 | // 68 | struct ggml_context * ctx; 69 | std::map tensors; 70 | }; 71 | 72 | // load the model's weights from a file 73 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 74 | printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 75 | 76 | auto fin = std::ifstream(fname, std::ios::binary); 77 | if (!fin) { 78 | fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); 79 | return false; 80 | } 81 | 82 | // verify magic 83 | { 84 | uint32_t magic; 85 | fin.read((char *) &magic, sizeof(magic)); 86 | if (magic != 0x67676d6c) { 87 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 88 | return false; 89 | } 90 | } 91 | 92 | int n_ff = 0; 93 | int n_parts = 0; 94 | 95 | // load hparams 96 | { 97 | auto & hparams = model.hparams; 98 | 99 | fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 100 | //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 101 | fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 102 | fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 103 | fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 104 | fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 105 | fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 106 | fin.read((char *) &hparams.f16, sizeof(hparams.f16)); 107 | 108 | hparams.n_ctx = n_ctx; 109 | 110 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 111 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 112 | 113 | printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); 114 | printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); 115 | printf("%s: n_embd = %d\n", __func__, hparams.n_embd); 116 | printf("%s: n_mult = %d\n", __func__, hparams.n_mult); 117 | printf("%s: n_head = %d\n", __func__, hparams.n_head); 118 | printf("%s: n_layer = %d\n", __func__, hparams.n_layer); 119 | printf("%s: n_rot = %d\n", __func__, hparams.n_rot); 120 | printf("%s: f16 = %d\n", __func__, hparams.f16); 121 | printf("%s: n_ff = %d\n", __func__, n_ff); 122 | printf("%s: n_parts = %d\n", __func__, n_parts); 123 | } 124 | 125 | // load vocab 126 | { 127 | const int32_t n_vocab = model.hparams.n_vocab; 128 | 129 | if (n_vocab != model.hparams.n_vocab) { 130 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 131 | __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 132 | return false; 133 | } 134 | 135 | std::string word; 136 | for (int i = 0; i < n_vocab; i++) { 137 | uint32_t len; 138 | fin.read((char *) &len, sizeof(len)); 139 | 140 | word.resize(len); 
141 | fin.read((char *) word.data(), len); 142 | 143 | vocab.token_to_id[word] = i; 144 | vocab.id_to_token[i] = word; 145 | 146 | //if (i < 30000) { 147 | // printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 148 | //} 149 | } 150 | } 151 | 152 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 153 | // in order to save memory and also to speed up the computation 154 | ggml_type wtype = GGML_TYPE_COUNT; 155 | switch (model.hparams.f16) { 156 | case 0: wtype = GGML_TYPE_F32; break; 157 | case 1: wtype = GGML_TYPE_F16; break; 158 | case 2: wtype = GGML_TYPE_Q4_0; break; 159 | case 3: wtype = GGML_TYPE_Q4_1; break; 160 | default: 161 | { 162 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 163 | __func__, fname.c_str(), model.hparams.f16); 164 | return false; 165 | } 166 | } 167 | 168 | const ggml_type wtype2 = GGML_TYPE_F32; 169 | 170 | auto & ctx = model.ctx; 171 | 172 | size_t ctx_size = 0; 173 | 174 | { 175 | const auto & hparams = model.hparams; 176 | 177 | const int n_embd = hparams.n_embd; 178 | const int n_layer = hparams.n_layer; 179 | const int n_ctx = hparams.n_ctx; 180 | const int n_vocab = hparams.n_vocab; 181 | 182 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 183 | 184 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 185 | 186 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 187 | 188 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 189 | 190 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 191 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 192 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 193 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 194 | 195 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 196 | 197 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 198 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 199 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 200 | 201 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k 202 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v 203 | 204 | ctx_size += (5 + 10*n_layer)*256; // object overhead 205 | 206 | printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 207 | } 208 | 209 | // create the ggml context 210 | { 211 | struct ggml_init_params params = { 212 | .mem_size = ctx_size, 213 | .mem_buffer = NULL, 214 | }; 215 | 216 | model.ctx = ggml_init(params); 217 | if (!model.ctx) { 218 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 219 | return false; 220 | } 221 | } 222 | 223 | // prepare memory for the weights 224 | { 225 | const auto & hparams = model.hparams; 226 | 227 | const int n_embd = hparams.n_embd; 228 | const int n_layer = hparams.n_layer; 229 | const int n_ctx = hparams.n_ctx; 230 | const int n_vocab = hparams.n_vocab; 231 | 232 | model.layers.resize(n_layer); 233 | 234 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 235 | 236 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 237 | model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 238 | 239 | // map by name 240 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 241 | 242 | model.tensors["norm.weight"] = model.norm; 243 | model.tensors["output.weight"] = model.output; 244 | 245 | for (int i = 0; i < n_layer; 
++i) { 246 | auto & layer = model.layers[i]; 247 | 248 | layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 249 | 250 | layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 251 | layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 252 | layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 253 | layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 254 | 255 | layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 256 | 257 | layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 258 | layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); 259 | layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 260 | 261 | // map by name 262 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 263 | 264 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 265 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 266 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 267 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 268 | 269 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 270 | 271 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 272 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 273 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; 274 | } 275 | } 276 | 277 | // key + value memory 278 | { 279 | const auto & hparams = model.hparams; 280 | 281 | const int n_embd = hparams.n_embd; 282 | const int n_layer = hparams.n_layer; 283 | const int n_ctx = hparams.n_ctx; 284 | 285 | const int n_mem = n_layer*n_ctx; 286 | const int n_elements = n_embd*n_mem; 287 | 288 | model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 289 | model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 290 | 291 | const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); 292 | 293 | printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); 294 | } 295 | 296 | const size_t file_offset = fin.tellg(); 297 | 298 | fin.close(); 299 | 300 | std::vector tmp; 301 | 302 | for (int i = 0; i < n_parts; ++i) { 303 | const int part_id = i; 304 | //const int part_id = n_parts - i - 1; 305 | 306 | std::string fname_part = fname; 307 | if (i > 0) { 308 | fname_part += "." 
+ std::to_string(i); 309 | } 310 | 311 | printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); 312 | 313 | fin = std::ifstream(fname_part, std::ios::binary); 314 | fin.seekg(file_offset); 315 | 316 | // load weights 317 | { 318 | int n_tensors = 0; 319 | size_t total_size = 0; 320 | 321 | printf("%s: ", __func__); 322 | 323 | while (true) { 324 | int32_t n_dims; 325 | int32_t length; 326 | int32_t ftype; 327 | 328 | fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 329 | fin.read(reinterpret_cast(&length), sizeof(length)); 330 | fin.read(reinterpret_cast(&ftype), sizeof(ftype)); 331 | 332 | if (fin.eof()) { 333 | break; 334 | } 335 | 336 | int32_t nelements = 1; 337 | int32_t ne[2] = { 1, 1 }; 338 | for (int i = 0; i < n_dims; ++i) { 339 | fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); 340 | nelements *= ne[i]; 341 | } 342 | 343 | std::string name(length, 0); 344 | fin.read(&name[0], length); 345 | 346 | if (model.tensors.find(name.data()) == model.tensors.end()) { 347 | fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); 348 | return false; 349 | } 350 | 351 | // split_type = 0: split by columns 352 | // split_type = 1: split by rows 353 | int split_type = 0; 354 | 355 | // split_type = 0: 356 | // regex: 357 | // - tok_embeddings.* 358 | // - layers.*.attention.wo.weight 359 | // - layers.*.feed_forward.w2.weight 360 | 361 | // split_type = 1: 362 | // regex: 363 | // - output.* 364 | // - layers.*.attention.wq.weight 365 | // - layers.*.attention.wk.weight 366 | // - layers.*.attention.wv.weight 367 | // - layers.*.feed_forward.w1.weight 368 | // - layers.*.feed_forward.w3.weight 369 | if (name.find("tok_embeddings") != std::string::npos) { 370 | split_type = 0; 371 | } else if (name.find("layers") != std::string::npos) { 372 | if (name.find("attention.wo.weight") != std::string::npos) { 373 | split_type = 0; 374 | } else if (name.find("feed_forward.w2.weight") != std::string::npos) { 375 | split_type = 0; 376 | } else { 377 | split_type = 1; 378 | } 379 | } else if (name.find("output") != std::string::npos) { 380 | split_type = 1; 381 | } 382 | 383 | auto tensor = model.tensors[name.data()]; 384 | 385 | if (n_dims == 1) { 386 | if (ggml_nelements(tensor) != nelements) { 387 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 388 | return false; 389 | } 390 | } else { 391 | if (ggml_nelements(tensor)/n_parts != nelements) { 392 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 393 | return false; 394 | } 395 | } 396 | 397 | if (n_dims == 1) { 398 | if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { 399 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 400 | __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); 401 | return false; 402 | } 403 | } else { 404 | if (split_type == 0) { 405 | if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) { 406 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 407 | __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]); 408 | return false; 409 | } 410 | } else { 411 | if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) { 412 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 413 | __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]); 414 | return false; 415 | } 
416 | } 417 | } 418 | 419 | if (0) { 420 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 421 | printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type); 422 | } 423 | 424 | size_t bpe = 0; 425 | 426 | switch (ftype) { 427 | case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; 428 | case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; 429 | case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; 430 | case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; 431 | default: 432 | { 433 | fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); 434 | return false; 435 | } 436 | }; 437 | 438 | if (n_dims == 1 || n_parts == 1) { 439 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { 440 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 441 | __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); 442 | return false; 443 | } 444 | 445 | if (part_id == 0) { 446 | fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); 447 | } else { 448 | fin.seekg(ggml_nbytes(tensor), std::ios::cur); 449 | } 450 | 451 | total_size += ggml_nbytes(tensor); 452 | } else { 453 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) { 454 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 455 | __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe); 456 | return false; 457 | } 458 | 459 | if (split_type == 0) { 460 | const int np0 = ne[0]; 461 | 462 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 463 | assert(row_size == tensor->nb[1]); 464 | 465 | for (int i1 = 0; i1 < ne[1]; ++i1) { 466 | const size_t offset_row = i1*row_size; 467 | const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 468 | fin.read(reinterpret_cast(tensor->data) + offset, row_size/n_parts); 469 | } 470 | } else { 471 | const int np1 = ne[1]; 472 | 473 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 474 | 475 | for (int i1 = 0; i1 < ne[1]; ++i1) { 476 | const size_t offset_row = (i1 + part_id*np1)*row_size; 477 | fin.read(reinterpret_cast(tensor->data) + offset_row, row_size); 478 | } 479 | } 480 | 481 | total_size += ggml_nbytes(tensor)/n_parts; 482 | } 483 | 484 | //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); 485 | if (++n_tensors % 8 == 0) { 486 | printf("."); 487 | fflush(stdout); 488 | } 489 | } 490 | 491 | printf(" done\n"); 492 | 493 | printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); 494 | } 495 | 496 | fin.close(); 497 | } 498 | 499 | return true; 500 | } 501 | 502 | // evaluate the transformer 503 | // 504 | // - model: the model 505 | // - n_threads: number of threads to use 506 | // - n_past: the context size so far 507 | // - embd_inp: the embeddings of the tokens in the context 508 | // - embd_w: the predicted logits for the next token 509 | // 510 | // The GPT-J model requires about 16MB of memory per input token. 
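//
// As a rough usage sketch (mirroring what main() below does; "prompt_tokens" and "id" are hypothetical names
// for the tokenized prompt and for a sampled token, and "model"/"params" are assumed to be already loaded/parsed):
//
//   std::vector<float> logits;
//   size_t mem_per_token = 0;
//
//   // dummy call with a few tokens to estimate the memory required per token
//   llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//
//   // evaluate the prompt, then feed back one sampled token at a time
//   llama_eval(model, params.n_threads, 0, prompt_tokens, logits, mem_per_token);
//   llama_eval(model, params.n_threads, (int) prompt_tokens.size(), { id }, logits, mem_per_token);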
511 | // 512 | bool llama_eval( 513 | const llama_model & model, 514 | const int n_threads, 515 | const int n_past, 516 | const std::vector & embd_inp, 517 | std::vector & embd_w, 518 | size_t & mem_per_token) { 519 | const int N = embd_inp.size(); 520 | 521 | const auto & hparams = model.hparams; 522 | 523 | const int n_embd = hparams.n_embd; 524 | const int n_layer = hparams.n_layer; 525 | const int n_ctx = hparams.n_ctx; 526 | const int n_head = hparams.n_head; 527 | const int n_vocab = hparams.n_vocab; 528 | const int n_rot = hparams.n_embd/hparams.n_head; 529 | 530 | const int d_key = n_embd/n_head; 531 | 532 | static size_t buf_size = 512u*1024*1024; 533 | static void * buf = malloc(buf_size); 534 | 535 | if (mem_per_token > 0 && mem_per_token*N > buf_size) { 536 | const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead 537 | //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); 538 | 539 | // reallocate 540 | buf_size = buf_size_new; 541 | buf = realloc(buf, buf_size); 542 | if (buf == nullptr) { 543 | fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); 544 | return false; 545 | } 546 | } 547 | 548 | struct ggml_init_params params = { 549 | .mem_size = buf_size, 550 | .mem_buffer = buf, 551 | }; 552 | 553 | struct ggml_context * ctx0 = ggml_init(params); 554 | struct ggml_cgraph gf = { .n_threads = n_threads }; 555 | 556 | struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); 557 | memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); 558 | 559 | struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); 560 | 561 | for (int il = 0; il < n_layer; ++il) { 562 | struct ggml_tensor * inpSA = inpL; 563 | 564 | struct ggml_tensor * cur; 565 | 566 | // norm 567 | { 568 | cur = ggml_norm(ctx0, inpL); 569 | 570 | // cur = attention_norm*cur 571 | cur = ggml_mul(ctx0, 572 | ggml_repeat(ctx0, model.layers[il].attention_norm, cur), 573 | cur); 574 | } 575 | 576 | // self-attention 577 | { 578 | struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); 579 | struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); 580 | struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); 581 | 582 | // store key and value to memory 583 | if (N >= 1) { 584 | struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); 585 | struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); 586 | 587 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); 588 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); 589 | } 590 | 591 | // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) 592 | struct ggml_tensor * Q = 593 | ggml_permute(ctx0, 594 | ggml_rope(ctx0, 595 | ggml_cpy(ctx0, 596 | Qcur, 597 | ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 598 | n_past, n_rot, 0), 599 | 0, 2, 1, 3); 600 | 601 | // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) 602 | struct ggml_tensor * K = 603 | ggml_permute(ctx0, 604 | ggml_rope(ctx0, 605 | ggml_reshape_3d(ctx0, 606 | ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), 607 | n_embd/n_head, n_head, n_past + N), 608 | n_past, n_rot, 1), 609 | 0, 2, 1, 3); 610 | 611 | // K * Q 612 | struct ggml_tensor * 
KQ = ggml_mul_mat(ctx0, K, Q); 613 | 614 | // KQ_scaled = KQ / sqrt(n_embd/n_head) 615 | struct ggml_tensor * KQ_scaled = 616 | ggml_scale(ctx0, 617 | KQ, 618 | ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) 619 | ); 620 | 621 | // KQ_masked = mask_past(KQ_scaled) 622 | struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); 623 | 624 | // KQ = soft_max(KQ_masked) 625 | struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); 626 | 627 | // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() 628 | struct ggml_tensor * V_trans = 629 | ggml_permute(ctx0, 630 | ggml_reshape_3d(ctx0, 631 | ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), 632 | n_embd/n_head, n_head, n_past + N), 633 | 1, 2, 0, 3); 634 | 635 | // KQV = transpose(V) * KQ_soft_max 636 | struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); 637 | 638 | // KQV_merged = KQV.permute(0, 2, 1, 3) 639 | struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); 640 | 641 | // cur = KQV_merged.contiguous().view(n_embd, N) 642 | cur = ggml_cpy(ctx0, 643 | KQV_merged, 644 | ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); 645 | 646 | // projection (no bias) 647 | cur = ggml_mul_mat(ctx0, 648 | model.layers[il].wo, 649 | cur); 650 | } 651 | 652 | struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); 653 | 654 | // feed-forward network 655 | { 656 | // norm 657 | { 658 | cur = ggml_norm(ctx0, inpFF); 659 | 660 | // cur = ffn_norm*cur 661 | cur = ggml_mul(ctx0, 662 | ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), 663 | cur); 664 | } 665 | 666 | struct ggml_tensor * tmp = ggml_mul_mat(ctx0, 667 | model.layers[il].w3, 668 | cur); 669 | 670 | 671 | cur = ggml_mul_mat(ctx0, 672 | model.layers[il].w1, 673 | cur); 674 | 675 | // SILU activation 676 | cur = ggml_silu(ctx0, cur); 677 | 678 | cur = ggml_mul(ctx0, cur, tmp); 679 | 680 | cur = ggml_mul_mat(ctx0, 681 | model.layers[il].w2, 682 | cur); 683 | } 684 | 685 | cur = ggml_add(ctx0, cur, inpFF); 686 | 687 | // input for next layer 688 | inpL = cur; 689 | } 690 | 691 | // norm 692 | { 693 | inpL = ggml_norm(ctx0, inpL); 694 | 695 | // inpL = norm*inpL 696 | inpL = ggml_mul(ctx0, 697 | ggml_repeat(ctx0, model.norm, inpL), 698 | inpL); 699 | } 700 | 701 | // lm_head 702 | { 703 | inpL = ggml_mul_mat(ctx0, model.output, inpL); 704 | } 705 | 706 | // logits -> probs 707 | //inpL = ggml_soft_max(ctx0, inpL); 708 | 709 | // run the computation 710 | ggml_build_forward_expand(&gf, inpL); 711 | ggml_graph_compute (ctx0, &gf); 712 | 713 | //if (n_past%100 == 0) { 714 | // ggml_graph_print (&gf); 715 | // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); 716 | //} 717 | 718 | //embd_w.resize(n_vocab*N); 719 | //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); 720 | 721 | // return result for just the last token 722 | embd_w.resize(n_vocab); 723 | memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); 724 | 725 | if (mem_per_token == 0) { 726 | mem_per_token = ggml_used_mem(ctx0)/N; 727 | } 728 | //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); 729 | 730 | ggml_free(ctx0); 731 | 732 | return true; 733 | } 734 | 735 | int main(int argc, char ** argv) { 736 | const int64_t t_main_start_us = ggml_time_us(); 737 | 738 | gpt_params params; 739 | params.model = "models/llama-7B/ggml-model.bin"; 740 | 741 | if (gpt_params_parse(argc, argv, params) == false) { 742 | return 1; 743 | } 744 | 
745 | if (params.seed < 0) { 746 | params.seed = time(NULL); 747 | } 748 | 749 | printf("%s: seed = %d\n", __func__, params.seed); 750 | 751 | std::mt19937 rng(params.seed); 752 | if (params.prompt.empty()) { 753 | params.prompt = gpt_random_prompt(rng); 754 | } 755 | 756 | // params.prompt = R"(// this function checks if the number n is prime 757 | //bool is_prime(int n) {)"; 758 | 759 | int64_t t_load_us = 0; 760 | 761 | gpt_vocab vocab; 762 | llama_model model; 763 | 764 | // load the model 765 | { 766 | const int64_t t_start_us = ggml_time_us(); 767 | 768 | if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ?? 769 | fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); 770 | return 1; 771 | } 772 | 773 | t_load_us = ggml_time_us() - t_start_us; 774 | } 775 | 776 | int n_past = 0; 777 | 778 | int64_t t_sample_us = 0; 779 | int64_t t_predict_us = 0; 780 | 781 | std::vector logits; 782 | 783 | // tokenize the prompt 784 | std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); 785 | 786 | params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); 787 | 788 | printf("\n"); 789 | printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); 790 | printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); 791 | for (int i = 0; i < (int) embd_inp.size(); i++) { 792 | printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); 793 | } 794 | printf("\n"); 795 | printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params.temp, params.top_k, params.top_p); 796 | printf("\n\n"); 797 | 798 | std::vector embd; 799 | 800 | // determine the required inference memory per token: 801 | size_t mem_per_token = 0; 802 | llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); 803 | 804 | for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { 805 | // predict 806 | if (embd.size() > 0) { 807 | const int64_t t_start_us = ggml_time_us(); 808 | 809 | if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { 810 | printf("Failed to predict\n"); 811 | return 1; 812 | } 813 | 814 | t_predict_us += ggml_time_us() - t_start_us; 815 | } 816 | 817 | n_past += embd.size(); 818 | embd.clear(); 819 | 820 | if (i >= embd_inp.size()) { 821 | // sample next token 822 | const float top_p = params.top_p; 823 | const float temp = params.temp; 824 | 825 | const int n_vocab = model.hparams.n_vocab; 826 | 827 | gpt_vocab::id id = 0; 828 | 829 | { 830 | const int64_t t_start_sample_us = ggml_time_us(); 831 | 832 | id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng); 833 | 834 | t_sample_us += ggml_time_us() - t_start_sample_us; 835 | } 836 | 837 | // add it to the context 838 | embd.push_back(id); 839 | } else { 840 | // if here, it means we are still processing the input prompt 841 | for (int k = i; k < embd_inp.size(); k++) { 842 | embd.push_back(embd_inp[k]); 843 | if (embd.size() > params.n_batch) { 844 | break; 845 | } 846 | } 847 | i += embd.size() - 1; 848 | } 849 | 850 | // display text 851 | for (auto id : embd) { 852 | printf("%s", vocab.id_to_token[id].c_str()); 853 | } 854 | fflush(stdout); 855 | 856 | // end of text token 857 | if (embd.back() == 2) { 858 | printf(" [end of text]\n"); 859 | break; 860 | } 861 | } 862 | 863 | // report timing 864 | { 865 | const int64_t t_main_end_us = ggml_time_us(); 866 | 867 | printf("\n\n"); 868 | 
printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); 869 | printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); 870 | printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); 871 | printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); 872 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 873 | } 874 | 875 | ggml_free(model.ctx); 876 | 877 | return 0; 878 | } 879 | -------------------------------------------------------------------------------- /server.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "ws.h" 17 | 18 | // determine number of model parts based on the dimension 19 | static const std::map LLAMA_N_PARTS = { 20 | { 4096, 1 }, 21 | { 5120, 2 }, 22 | { 6656, 4 }, 23 | { 8192, 8 }, 24 | }; 25 | 26 | // default hparams (LLaMA 7B) 27 | struct llama_hparams { 28 | int32_t n_vocab = 32000; 29 | int32_t n_ctx = 512; // this is provided as user input? 30 | int32_t n_embd = 4096; 31 | int32_t n_mult = 256; 32 | int32_t n_head = 32; 33 | int32_t n_layer = 32; 34 | int32_t n_rot = 64; 35 | int32_t f16 = 1; 36 | }; 37 | 38 | struct llama_layer { 39 | // normalization 40 | struct ggml_tensor * attention_norm; 41 | 42 | // attention 43 | struct ggml_tensor * wq; 44 | struct ggml_tensor * wk; 45 | struct ggml_tensor * wv; 46 | struct ggml_tensor * wo; 47 | 48 | // normalization 49 | struct ggml_tensor * ffn_norm; 50 | 51 | // ff 52 | struct ggml_tensor * w1; 53 | struct ggml_tensor * w2; 54 | struct ggml_tensor * w3; 55 | }; 56 | 57 | struct llama_model { 58 | llama_hparams hparams; 59 | 60 | struct ggml_tensor * tok_embeddings; 61 | 62 | struct ggml_tensor * norm; 63 | struct ggml_tensor * output; 64 | 65 | std::vector layers; 66 | 67 | // key + value memory 68 | struct ggml_tensor * memory_k; 69 | struct ggml_tensor * memory_v; 70 | 71 | // 72 | struct ggml_context * ctx; 73 | std::map tensors; 74 | }; 75 | 76 | // load the model's weights from a file 77 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 78 | printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 79 | 80 | auto fin = std::ifstream(fname, std::ios::binary); 81 | if (!fin) { 82 | fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); 83 | return false; 84 | } 85 | 86 | // verify magic 87 | { 88 | uint32_t magic; 89 | fin.read((char *) &magic, sizeof(magic)); 90 | if (magic != 0x67676d6c) { 91 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 92 | return false; 93 | } 94 | } 95 | 96 | int n_ff = 0; 97 | int n_parts = 0; 98 | 99 | // load hparams 100 | { 101 | auto & hparams = model.hparams; 102 | 103 | fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 104 | //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 105 | fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 106 | fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 107 | fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 108 | fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 109 | fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 110 | fin.read((char *) &hparams.f16, 
sizeof(hparams.f16)); 111 | 112 | hparams.n_ctx = n_ctx; 113 | 114 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 115 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 116 | 117 | printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); 118 | printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); 119 | printf("%s: n_embd = %d\n", __func__, hparams.n_embd); 120 | printf("%s: n_mult = %d\n", __func__, hparams.n_mult); 121 | printf("%s: n_head = %d\n", __func__, hparams.n_head); 122 | printf("%s: n_layer = %d\n", __func__, hparams.n_layer); 123 | printf("%s: n_rot = %d\n", __func__, hparams.n_rot); 124 | printf("%s: f16 = %d\n", __func__, hparams.f16); 125 | printf("%s: n_ff = %d\n", __func__, n_ff); 126 | printf("%s: n_parts = %d\n", __func__, n_parts); 127 | } 128 | 129 | // load vocab 130 | { 131 | const int32_t n_vocab = model.hparams.n_vocab; 132 | 133 | if (n_vocab != model.hparams.n_vocab) { 134 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 135 | __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 136 | return false; 137 | } 138 | 139 | std::string word; 140 | for (int i = 0; i < n_vocab; i++) { 141 | uint32_t len; 142 | fin.read((char *) &len, sizeof(len)); 143 | 144 | word.resize(len); 145 | fin.read((char *) word.data(), len); 146 | 147 | vocab.token_to_id[word] = i; 148 | vocab.id_to_token[i] = word; 149 | 150 | //if (i < 30000) { 151 | // printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 152 | //} 153 | } 154 | } 155 | 156 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 157 | // in order to save memory and also to speed up the computation 158 | ggml_type wtype = GGML_TYPE_COUNT; 159 | switch (model.hparams.f16) { 160 | case 0: wtype = GGML_TYPE_F32; break; 161 | case 1: wtype = GGML_TYPE_F16; break; 162 | case 2: wtype = GGML_TYPE_Q4_0; break; 163 | case 3: wtype = GGML_TYPE_Q4_1; break; 164 | default: 165 | { 166 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 167 | __func__, fname.c_str(), model.hparams.f16); 168 | return false; 169 | } 170 | } 171 | 172 | const ggml_type wtype2 = GGML_TYPE_F32; 173 | 174 | auto & ctx = model.ctx; 175 | 176 | size_t ctx_size = 0; 177 | 178 | { 179 | const auto & hparams = model.hparams; 180 | 181 | const int n_embd = hparams.n_embd; 182 | const int n_layer = hparams.n_layer; 183 | const int n_ctx = hparams.n_ctx; 184 | const int n_vocab = hparams.n_vocab; 185 | 186 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 187 | 188 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 189 | 190 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 191 | 192 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 193 | 194 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 195 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 196 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 197 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 198 | 199 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 200 | 201 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 202 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 203 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 204 | 205 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k 206 | ctx_size += 
n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v 207 | 208 | ctx_size += (5 + 10*n_layer)*256; // object overhead 209 | 210 | printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 211 | } 212 | 213 | // create the ggml context 214 | { 215 | struct ggml_init_params params = { 216 | .mem_size = ctx_size, 217 | .mem_buffer = NULL, 218 | }; 219 | 220 | model.ctx = ggml_init(params); 221 | if (!model.ctx) { 222 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 223 | return false; 224 | } 225 | } 226 | 227 | // prepare memory for the weights 228 | { 229 | const auto & hparams = model.hparams; 230 | 231 | const int n_embd = hparams.n_embd; 232 | const int n_layer = hparams.n_layer; 233 | const int n_ctx = hparams.n_ctx; 234 | const int n_vocab = hparams.n_vocab; 235 | 236 | model.layers.resize(n_layer); 237 | 238 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 239 | 240 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 241 | model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 242 | 243 | // map by name 244 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 245 | 246 | model.tensors["norm.weight"] = model.norm; 247 | model.tensors["output.weight"] = model.output; 248 | 249 | for (int i = 0; i < n_layer; ++i) { 250 | auto & layer = model.layers[i]; 251 | 252 | layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 253 | 254 | layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 255 | layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 256 | layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 257 | layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 258 | 259 | layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 260 | 261 | layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 262 | layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); 263 | layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 264 | 265 | // map by name 266 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 267 | 268 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 269 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 270 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 271 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 272 | 273 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 274 | 275 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 276 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 277 | model.tensors["layers." 
+ std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
278 |         }
279 |     }
280 | 
281 |     // key + value memory
282 |     {
283 |         const auto & hparams = model.hparams;
284 | 
285 |         const int n_embd  = hparams.n_embd;
286 |         const int n_layer = hparams.n_layer;
287 |         const int n_ctx   = hparams.n_ctx;
288 | 
289 |         const int n_mem      = n_layer*n_ctx;
290 |         const int n_elements = n_embd*n_mem;
291 | 
292 |         model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
293 |         model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
294 | 
295 |         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
296 | 
297 |         printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
298 |     }
299 | 
300 |     const size_t file_offset = fin.tellg();
301 | 
302 |     fin.close();
303 | 
304 |     std::vector<char> tmp;
305 | 
306 |     for (int i = 0; i < n_parts; ++i) {
307 |         const int part_id = i;
308 |         //const int part_id = n_parts - i - 1;
309 | 
310 |         std::string fname_part = fname;
311 |         if (i > 0) {
312 |             fname_part += "." + std::to_string(i);
313 |         }
314 | 
315 |         printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
316 | 
317 |         fin = std::ifstream(fname_part, std::ios::binary);
318 |         fin.seekg(file_offset);
319 | 
320 |         // load weights
321 |         {
322 |             int n_tensors = 0;
323 |             size_t total_size = 0;
324 | 
325 |             printf("%s: ", __func__);
326 | 
327 |             while (true) {
328 |                 int32_t n_dims;
329 |                 int32_t length;
330 |                 int32_t ftype;
331 | 
332 |                 fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
333 |                 fin.read(reinterpret_cast<char *>(&length), sizeof(length));
334 |                 fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
335 | 
336 |                 if (fin.eof()) {
337 |                     break;
338 |                 }
339 | 
340 |                 int32_t nelements = 1;
341 |                 int32_t ne[2] = { 1, 1 };
342 |                 for (int i = 0; i < n_dims; ++i) {
343 |                     fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
344 |                     nelements *= ne[i];
345 |                 }
346 | 
347 |                 std::string name(length, 0);
348 |                 fin.read(&name[0], length);
349 | 
350 |                 if (model.tensors.find(name.data()) == model.tensors.end()) {
351 |                     fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
352 |                     return false;
353 |                 }
354 | 
355 |                 // split_type = 0: split by columns
356 |                 // split_type = 1: split by rows
357 |                 int split_type = 0;
358 | 
359 |                 // split_type = 0:
360 |                 // regex:
361 |                 //   - tok_embeddings.*
362 |                 //   - layers.*.attention.wo.weight
363 |                 //   - layers.*.feed_forward.w2.weight
364 | 
365 |                 // split_type = 1:
366 |                 // regex:
367 |                 //   - output.*
368 |                 //   - layers.*.attention.wq.weight
369 |                 //   - layers.*.attention.wk.weight
370 |                 //   - layers.*.attention.wv.weight
371 |                 //   - layers.*.feed_forward.w1.weight
372 |                 //   - layers.*.feed_forward.w3.weight
373 |                 if (name.find("tok_embeddings") != std::string::npos) {
374 |                     split_type = 0;
375 |                 } else if (name.find("layers") != std::string::npos) {
376 |                     if (name.find("attention.wo.weight") != std::string::npos) {
377 |                         split_type = 0;
378 |                     } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
379 |                         split_type = 0;
380 |                     } else {
381 |                         split_type = 1;
382 |                     }
383 |                 } else if (name.find("output") != std::string::npos) {
384 |                     split_type = 1;
385 |                 }
386 | 
387 |                 auto tensor = model.tensors[name.data()];
388 | 
389 |                 if (n_dims == 1) {
390 |                     if (ggml_nelements(tensor) != nelements) {
391 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
392 |                         return false;
393 |                     }
394 |                 } else {
395 |                     if (ggml_nelements(tensor)/n_parts != nelements) {
396 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
397 |                         return false;
398 |                     }
399 |                 }
400 | 
401 |                 if (n_dims == 1) {
402 |                     if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
403 |                         fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
404 |                                 __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
405 |                         return false;
406 |                     }
407 |                 } else {
408 |                     if (split_type == 0) {
409 |                         if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
410 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
411 |                                     __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
412 |                             return false;
413 |                         }
414 |                     } else {
415 |                         if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
416 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
417 |                                     __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
418 |                             return false;
419 |                         }
420 |                     }
421 |                 }
422 | 
423 |                 if (0) {
424 |                     static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
425 |                     printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
426 |                 }
427 | 
428 |                 size_t bpe = 0;
429 | 
430 |                 switch (ftype) {
431 |                     case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
432 |                     case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
433 |                     case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
434 |                     case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
435 |                     default:
436 |                         {
437 |                             fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
438 |                             return false;
439 |                         }
440 |                 };
441 | 
442 |                 if (n_dims == 1 || n_parts == 1) {
443 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
444 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
445 |                                 __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
446 |                         return false;
447 |                     }
448 | 
449 |                     if (part_id == 0) {
450 |                         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
451 |                     } else {
452 |                         fin.seekg(ggml_nbytes(tensor), std::ios::cur);
453 |                     }
454 | 
455 |                     total_size += ggml_nbytes(tensor);
456 |                 } else {
457 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
458 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
459 |                                 __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
460 |                         return false;
461 |                     }
462 | 
463 |                     if (split_type == 0) {
464 |                         const int np0 = ne[0];
465 | 
466 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
467 |                         assert(row_size == tensor->nb[1]);
468 | 
469 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
470 |                             const size_t offset_row = i1*row_size;
471 |                             const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
472 |                             fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
473 |                         }
474 |                     } else {
475 |                         const int np1 = ne[1];
476 | 
477 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
478 | 
479 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
480 |                             const size_t offset_row = (i1 + part_id*np1)*row_size;
481 |                             fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
482 |                         }
483 |                     }
484 | 
485 |                     total_size += ggml_nbytes(tensor)/n_parts;
486 |                 }
487 | 
488 |                 //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
489 |                 if (++n_tensors % 8 == 0) {
490 |                     printf(".");
491 |                     fflush(stdout);
492 |                 }
493 |             }
494 | 
495 |             printf(" done\n");
496 | 
497 |             printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
498 |         }
499 | 
500 |         fin.close();
501 |     }
502 | 
503 |     return true;
504 | }
505 | 
506 | // evaluate the transformer
507 | //
508 | //   - model:     the model
509 | //   - n_threads: number of threads to use
510 | //   - n_past:    the context size so far
511 | //   - embd_inp:  the embeddings of the tokens in the context
512 | //   - embd_w:    the predicted logits for the next token
513 | //
514 | // The GPT-J model requires about 16MB of memory per input token.
515 | //
516 | bool llama_eval(
517 |         const llama_model & model,
518 |         const int n_threads,
519 |         const int n_past,
520 |         const std::vector<gpt_vocab::id> & embd_inp,
521 |               std::vector<float>         & embd_w,
522 |               size_t                     & mem_per_token) {
523 |     const int N = embd_inp.size();
524 | 
525 |     const auto & hparams = model.hparams;
526 | 
527 |     const int n_embd  = hparams.n_embd;
528 |     const int n_layer = hparams.n_layer;
529 |     const int n_ctx   = hparams.n_ctx;
530 |     const int n_head  = hparams.n_head;
531 |     const int n_vocab = hparams.n_vocab;
532 |     const int n_rot   = hparams.n_embd/hparams.n_head;
533 | 
534 |     const int d_key = n_embd/n_head;
535 | 
536 |     static size_t buf_size = 512u*1024*1024;
537 |     static void * buf = malloc(buf_size);
538 | 
539 |     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
540 |         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
541 |         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
542 | 
543 |         // reallocate
544 |         buf_size = buf_size_new;
545 |         buf = realloc(buf, buf_size);
546 |         if (buf == nullptr) {
547 |             fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
548 |             return false;
549 |         }
550 |     }
551 | 
552 |     struct ggml_init_params params = {
553 |         .mem_size   = buf_size,
554 |         .mem_buffer = buf,
555 |     };
556 | 
557 |     struct ggml_context * ctx0 = ggml_init(params);
558 |     struct ggml_cgraph gf = { .n_threads = n_threads };
559 | 
560 |     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
561 |     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
562 | 
563 |     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
564 | 
565 |     for (int il = 0; il < n_layer; ++il) {
566 |         struct ggml_tensor * inpSA = inpL;
567 | 
568 |         struct ggml_tensor * cur;
569 | 
570 |         // norm
571 |         {
572 |             cur = ggml_norm(ctx0, inpL);
573 | 
574 |             // cur = attention_norm*cur
575 |             cur = ggml_mul(ctx0,
576 |                         ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
577 |                         cur);
578 |         }
579 | 
580 |         // self-attention
581 |         {
582 |             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
583 |             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
584 |             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
585 | 
586 |             // store key and value to memory
587 |             if (N >= 1) {
588 |                 struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
589 |                 struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
590 | 
591 |                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
592 |                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
593 |             }
594 | 
595 |             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
596 |             struct ggml_tensor * Q =
597 |                 ggml_permute(ctx0,
598 |                         ggml_rope(ctx0,
599 |                             ggml_cpy(ctx0,
600 |                                 Qcur,
601 |                                 ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
602 |                             n_past, n_rot, 0),
603 |                         0, 2, 1, 3);
604 | 
605 |             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
606 |             struct ggml_tensor * K =
607 |                 ggml_permute(ctx0,
608 |                         ggml_rope(ctx0,
609 |                             ggml_reshape_3d(ctx0,
610 |                                 ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
611 |                                 n_embd/n_head, n_head, n_past + N),
612 |                             n_past, n_rot, 1),
613 |                         0, 2, 1, 3);
614 | 
615 |             // K * Q
616 |             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
617 | 
618 |             // KQ_scaled = KQ / sqrt(n_embd/n_head)
619 |             struct ggml_tensor * KQ_scaled =
620 |                 ggml_scale(ctx0,
621 |                         KQ,
622 |                         ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
623 |                         );
624 | 
625 |             // KQ_masked = mask_past(KQ_scaled)
626 |             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
627 | 
628 |             // KQ = soft_max(KQ_masked)
629 |             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
630 | 
631 |             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
632 |             struct ggml_tensor * V_trans =
633 |                 ggml_permute(ctx0,
634 |                         ggml_reshape_3d(ctx0,
635 |                             ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
636 |                             n_embd/n_head, n_head, n_past + N),
637 |                         1, 2, 0, 3);
638 | 
639 |             // KQV = transpose(V) * KQ_soft_max
640 |             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
641 | 
642 |             // KQV_merged = KQV.permute(0, 2, 1, 3)
643 |             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
644 | 
645 |             // cur = KQV_merged.contiguous().view(n_embd, N)
646 |             cur = ggml_cpy(ctx0,
647 |                     KQV_merged,
648 |                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
649 | 
650 |             // projection (no bias)
651 |             cur = ggml_mul_mat(ctx0,
652 |                     model.layers[il].wo,
653 |                     cur);
654 |         }
655 | 
656 |         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
657 | 
658 |         // feed-forward network
659 |         {
660 |             // norm
661 |             {
662 |                 cur = ggml_norm(ctx0, inpFF);
663 | 
664 |                 // cur = ffn_norm*cur
665 |                 cur = ggml_mul(ctx0,
666 |                         ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
667 |                         cur);
668 |             }
669 | 
670 |             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
671 |                     model.layers[il].w3,
672 |                     cur);
673 | 
674 | 
675 |             cur = ggml_mul_mat(ctx0,
676 |                     model.layers[il].w1,
677 |                     cur);
678 | 
679 |             // SILU activation
680 |             cur = ggml_silu(ctx0, cur);
681 | 
682 |             cur = ggml_mul(ctx0, cur, tmp);
683 | 
684 |             cur = ggml_mul_mat(ctx0,
685 |                     model.layers[il].w2,
686 |                     cur);
687 |         }
688 | 
689 |         cur = ggml_add(ctx0, cur, inpFF);
690 | 
691 |         // input for next layer
692 |         inpL = cur;
693 |     }
694 | 
695 |     // norm
696 |     {
697 |         inpL = ggml_norm(ctx0, inpL);
698 | 
699 |         // inpL = norm*inpL
700 |         inpL = ggml_mul(ctx0,
701 |                     ggml_repeat(ctx0, model.norm, inpL),
702 |                     inpL);
703 |     }
704 | 
705 |     // lm_head
706 |     {
707 |         inpL = ggml_mul_mat(ctx0, model.output, inpL);
708 |     }
709 | 
710 |     // logits -> probs
711 |     //inpL = ggml_soft_max(ctx0, inpL);
712 | 
713 |     // run the computation
714 |     ggml_build_forward_expand(&gf, inpL);
715 |     ggml_graph_compute       (ctx0, &gf);
716 | 
717 |     //if (n_past%100 == 0) {
718 |     //    ggml_graph_print   (&gf);
719 |     //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
720 |     //}
721 | 
722 |     //embd_w.resize(n_vocab*N);
723 |     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
724 | 
725 |     // return result for just the last token
726 |     embd_w.resize(n_vocab);
727 |     memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
728 | 
729 |     if (mem_per_token == 0) {
730 |         mem_per_token = ggml_used_mem(ctx0)/N;
731 |     }
732 |     //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
733 | 
734 |     ggml_free(ctx0);
735 | 
736 |     return true;
737 | }
738 | 
739 | // globals
740 | gpt_vocab vocab;
741 | llama_model model;
742 | gpt_params params;
743 | int n_past = 0;
744 | int64_t t_sample_us  = 0;
745 | int64_t t_predict_us = 0;
746 | int64_t t_load_us = 0;
747 | 
748 | void onopen(ws_cli_conn_t *client)
749 | {
750 |     char *cli;
751 |     cli = ws_getaddress(client);
752 |     printf("Connection opened, addr: %s\n", cli);
753 | }
754 | 
755 | void onclose(ws_cli_conn_t *client)
756 | {
757 |     char *cli;
758 |     cli = ws_getaddress(client);
759 |     printf("Connection closed, addr: %s\n", cli);
760 | }
761 | 
762 | void onmessage(ws_cli_conn_t *client,
763 |     const unsigned char *msg, uint64_t size, int type)
764 | {
765 |     char *cli;
766 |     cli = ws_getaddress(client);
767 | 
768 |     // The first line contains the parameters, in this format:
769 |     // temperature:top_p:top_k:max_tokens
770 |     // This terminates in a newline. The prompt follows.
771 | 
772 |     char *p = (char *) msg;
773 |     char *q = strchr(p, '\n');
774 |     if (q == NULL) {
775 |         printf("Invalid parameters: %s\n", p);
776 |         return;
777 |     }
778 |     *q = '\0';
779 |     q++;
780 | 
781 |     printf("Parameters: %s\n", p);
782 | 
783 |     char *r = strchr(p, ':');
784 |     if (r == NULL) {
785 |         printf("Invalid parameters: %s\n", p);
786 |         return;
787 |     }
788 |     *r = '\0';
789 |     r++;
790 |     params.temp = atof(p);
791 | 
792 |     p = strchr(r, ':');
793 |     if (p == NULL) {
794 |         printf("Invalid parameters: %s\n", r);
795 |         return;
796 |     }
797 |     *p = '\0';
798 |     p++;
799 |     params.top_p = atof(r);
800 | 
801 |     r = strchr(p, ':');
802 |     if (r == NULL) {
803 |         printf("Invalid parameters: %s\n", p);
804 |         return;
805 |     }
806 |     *r = '\0';
807 |     r++;
808 |     params.top_k = atoi(p);
809 | 
810 |     params.n_predict = atoi(r);
811 | 
812 |     // Print the params
813 |     printf("temp: %f, top_p: %f, top_k: %d, n_predict: %d\n",
814 |         params.temp, params.top_p, params.top_k, params.n_predict);
815 | 
816 |     // Now get the prompt
817 |     msg = (const unsigned char *) q;
818 | 
819 |     printf("Prompt received: \"%s\" from: %s\n", msg, cli);
820 | 
821 |     params.prompt = (char *) msg;
822 | 
823 |     std::vector<float> logits;
824 |     std::mt19937 rng(params.seed);
825 | 
826 |     // tokenize the prompt
827 |     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
828 | 
829 |     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
830 | 
831 |     printf("\n");
832 |     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
833 |     printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
834 |     for (int i = 0; i < (int) embd_inp.size(); i++) {
835 |         printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
836 |     }
837 |     printf("\n");
838 |     printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params.temp, params.top_k, params.top_p);
839 |     printf("\n\n");
840 | 
841 |     std::vector<gpt_vocab::id> embd;
842 | 
843 |     // determine the required inference memory per token:
844 |     size_t mem_per_token = 0;
845 |     llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
846 | 
847 |     for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
848 |         // predict
849 |         if (embd.size() > 0) {
850 |             const int64_t t_start_us = ggml_time_us();
851 | 
852 |             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
853 |                 printf("Failed to predict\n");
854 |                 return;
855 |             }
856 | 
857 |             t_predict_us += ggml_time_us() - t_start_us;
858 |         }
859 | 
860 |         n_past += embd.size();
861 |         embd.clear();
862 | 
863 |         if (i >= embd_inp.size()) {
864 |             // sample next token
865 |             const float top_p = params.top_p;
866 |             const float temp  = params.temp;
867 | 
868 |             const int n_vocab = model.hparams.n_vocab;
869 | 
870 |             gpt_vocab::id id = 0;
871 | 
872 |             {
873 |                 const int64_t t_start_sample_us = ggml_time_us();
874 | 
875 |                 id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng);
876 | 
877 |                 t_sample_us += ggml_time_us() - t_start_sample_us;
878 |             }
879 | 
880 |             // add it to the context
881 |             embd.push_back(id);
882 | 
883 |             // display text
884 |             for (auto id : embd) {
885 |                 ws_sendframe_txt(client, vocab.id_to_token[id].c_str());
886 |                 printf("%s", vocab.id_to_token[id].c_str());
887 |             }
888 |             fflush(stdout);
889 |         } else {
890 |             // if here, it means we are still processing the input prompt
891 |             for (int k = i; k < embd_inp.size(); k++) {
892 |                 embd.push_back(embd_inp[k]);
893 |                 if (embd.size() > params.n_batch) {
894 |                     break;
895 |                 }
896 |             }
897 |             i += embd.size() - 1;
898 |         }
899 | 
900 |         // end of text token
901 |         if (embd.back() == 2) {
902 |             printf(" [end of text]\n");
903 |             break;
904 |         }
905 |     }
906 | 
907 |     ws_close_client(client);
908 | }
909 | 
910 | int main(int argc, char ** argv) {
911 |     const int64_t t_main_start_us = ggml_time_us();
912 | 
913 |     params.model = "models/llama-7B/ggml-model.bin";
914 | 
915 |     if (gpt_params_parse(argc, argv, params) == false) {
916 |         return 1;
917 |     }
918 | 
919 |     if (params.seed < 0) {
920 |         params.seed = time(NULL);
921 |     }
922 | 
923 |     printf("%s: seed = %d\n", __func__, params.seed);
924 | 
925 |     std::mt19937 rng(params.seed);
926 |     if (params.prompt.empty()) {
927 |         params.prompt = gpt_random_prompt(rng);
928 |     }
929 | 
930 |     // load the model
931 |     {
932 |         const int64_t t_start_us = ggml_time_us();
933 | 
934 |         if (!llama_model_load(params.model, model, vocab, 512)) {  // TODO: set context from user input ??
935 |             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
936 |             return 1;
937 |         }
938 | 
939 |         t_load_us = ggml_time_us() - t_start_us;
940 |     }
941 | 
942 |     struct ws_events evs;
943 |     evs.onopen    = &onopen;
944 |     evs.onclose   = &onclose;
945 |     evs.onmessage = &onmessage;
946 | 
947 |     /*
948 |      * Main loop, this function never* returns.
949 |      *
950 |      * *If the third argument is != 0, a new thread is created
951 |      * to handle new connections.
952 |      */
953 |     ws_socket(&evs, 10811, 0, 1000);
954 |     return 0;
955 | }
956 | 
--------------------------------------------------------------------------------
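Note on the wire protocol implemented by onmessage() above: a client connects to port 10811 and sends a single text frame whose first line is "temperature:top_p:top_k:max_tokens", followed by a newline and then the prompt. The server streams one text frame per generated token back to the client via ws_sendframe_txt() and closes the connection when generation ends. The snippet below is a minimal sketch of how such a payload could be assembled; the parameter values and the prompt are illustrative assumptions, and any WebSocket client can be used to deliver the frame. Note that top_k is parsed by the server but llama_sample_top_p() above only uses top_p and temp.

// payload_sketch.cpp - assembles an example request payload for the server above.
// Hypothetical values; send the resulting string as one text frame to ws://<host>:10811.
#include <cstdio>
#include <string>

int main() {
    const float temperature = 0.8f;
    const float top_p       = 0.95f;
    const int   top_k       = 40;   // parsed by the server, ignored by the top-p sampler
    const int   max_tokens  = 128;
    const std::string prompt = "Building a website can be done in 10 simple steps:";

    // First line: "temperature:top_p:top_k:max_tokens", terminated by '\n'.
    char header[128];
    std::snprintf(header, sizeof(header), "%.2f:%.2f:%d:%d\n", temperature, top_p, top_k, max_tokens);

    // The prompt follows the newline; this whole string is the single frame to send.
    const std::string payload = std::string(header) + prompt;
    std::printf("%s\n", payload.c_str());
    return 0;
}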