├── doc ├── chat_ll2.jpg ├── gen_ll2.jpg └── linux_build.txt ├── src ├── vs_inc │ ├── stdbool.h │ └── stdint.h ├── model │ ├── load │ │ ├── load_tokenizer.c │ │ ├── load_transformer.h │ │ ├── load_tokenizer.h │ │ └── json.h │ ├── omp_numa.h │ ├── sampler.h │ ├── tr_opt_inc.c │ ├── tokenizer.h │ ├── kv_cache.c │ └── model.h ├── matmul │ ├── tr_opt_simd.h │ ├── w_types.h │ ├── matmul_f32.c │ ├── matmul_priv.h │ ├── mm_hsum.h │ ├── matmul.h │ ├── matmul_f16.c │ ├── matmul_bf16.c │ ├── tr_opt_simd.c │ └── matmul_sf16.c ├── utils │ ├── mem_alloc.h │ ├── utf8.h │ ├── time_ev.h │ ├── time_ev.c │ ├── term_utf8.h │ ├── numa.h │ ├── l_util.h │ ├── mem_alloc.c │ ├── numa_w.c │ └── utf8.c ├── dir_info.txt ├── main.c └── generate.c ├── tests ├── 1_node │ ├── gen_f12 │ │ ├── gen_vigogne2.txt │ │ ├── gen_codellama.txt │ │ ├── gen_llama1.txt │ │ ├── gen_tinyllama.txt │ │ ├── gen_llama2.txt │ │ ├── gen_zephyr.txt │ │ ├── gen_llama3.txt │ │ ├── gen_mistral.txt │ │ ├── gen_llama31.txt │ │ └── gen_qwen2.txt │ ├── gen_ref │ │ ├── gen_vigogne2.txt │ │ ├── gen_codellama.txt │ │ ├── gen_llama1.txt │ │ ├── gen_llama2.txt │ │ ├── gen_tinyllama.txt │ │ ├── gen_mathstral_fp32.txt │ │ ├── gen_llama3.txt │ │ ├── gen_mistral.txt │ │ ├── gen_zephyr.txt │ │ ├── gen_mixtral_f8.txt │ │ └── gen_qwen2.txt │ └── res_1socket.txt ├── dir_info.txt └── 2_nodes │ ├── res_2sockets.txt │ └── llama2_ht_off.txt ├── LICENSE ├── llama_st.sln ├── make_gcc.txt └── run_json ├── run_llama1.json ├── run_codellama.json ├── run_mathstral.json ├── run_mistral.json ├── run_tinyllama.json ├── run_zephyr.json ├── run_llama2.json ├── run_mixtral.json ├── run_vigogne2.json └── run_llama3.json /doc/chat_ll2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/doc/chat_ll2.jpg -------------------------------------------------------------------------------- /doc/gen_ll2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/doc/gen_ll2.jpg -------------------------------------------------------------------------------- /src/vs_inc/stdbool.h: -------------------------------------------------------------------------------- 1 | typedef unsigned int bool; 2 | #define true 1 3 | #define false 0 4 | -------------------------------------------------------------------------------- /src/model/load/load_tokenizer.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/src/model/load/load_tokenizer.c -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_vigogne2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/tests/1_node/gen_f12/gen_vigogne2.txt -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_vigogne2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/tests/1_node/gen_ref/gen_vigogne2.txt -------------------------------------------------------------------------------- /src/vs_inc/stdint.h: -------------------------------------------------------------------------------- 1 | typedef __int64 int64_t; 2 | typedef unsigned __int64 uint64_t; 3 | typedef int int32_t; 4 | typedef unsigned int uint32_t; 5 | -------------------------------------------------------------------------------- /src/model/load/load_transformer.h: -------------------------------------------------------------------------------- 1 | // load config 2 | void load_checkpoint_config(void); 3 | 4 | // load checkpoint weights 5 | void load_checkpoint_weights(void); 6 | 
-------------------------------------------------------------------------------- /doc/linux_build.txt: -------------------------------------------------------------------------------- 1 | Linux build require a linux port for sources: 2 | - term_utf8_w.c 3 | - numa_w.c 4 | - time_ev.c 5 | 6 | Other sources should build without changes for x86 64 arch. 7 | -------------------------------------------------------------------------------- /src/model/load/load_tokenizer.h: -------------------------------------------------------------------------------- 1 | // find merge datas, return NULL if not found (code in load_tokenizer.c) 2 | const struct merge_id_t *bpe_find_merge(int l_id, int r_id); 3 | 4 | // load (private to tokenizer.c, defined in load_tokenizer.c) 5 | void load_tokenizer(const char *file_name); 6 | -------------------------------------------------------------------------------- /tests/dir_info.txt: -------------------------------------------------------------------------------- 1 | Contain speed test with and without data conversion and hyperthreading on/off using various models. 
2 | 3 | 1_node: 4 | tested with xeon e5 2680 v4 (14c 2400Mhz), 64Gb ram ddr4 2400 4ch 5 | 6 | 2_nodes: 7 | tested with 2 * xeon e5 2650 v4 (12c 2200Mhz), 128Gb ram ddr4 2400 (64Gb/socket) 8 | -------------------------------------------------------------------------------- /src/matmul/tr_opt_simd.h: -------------------------------------------------------------------------------- 1 | // ----------------------------------------------------- 2 | // simd optimized head attention (code in tr_opt_simd.c) 3 | 4 | typedef void (* head_att_opt_t)(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t *p); 5 | 6 | // head attention simd (defined by matmul_init()) 7 | extern head_att_opt_t head_att_opt; 8 | 9 | // init 10 | void init_head_att_opt(enum e_simd_typ simd_typ); 11 | -------------------------------------------------------------------------------- /src/utils/mem_alloc.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------ 2 | // memory allocation with check 3 | 4 | void *malloc_check(size_t size); 5 | void *calloc_check(size_t size); 6 | void *realloc_check(void *ptr, size_t size); 7 | void free_check(void *ptr); 8 | 9 | // print currently allocated size 10 | void dbg_print_alloc(void); 11 | 12 | // alloc string 13 | char *str_alloc(const char *str, int len); 14 | 15 | #define VAR_ALLOC(var, typ, ne) typ *var = (typ *)malloc_check((ne)*sizeof(typ)) 16 | -------------------------------------------------------------------------------- /src/matmul/w_types.h: -------------------------------------------------------------------------------- 1 | // weights data types 2 | enum e_w_type 3 | { 4 | w_type_f32 = 0, 5 | w_type_f16, 6 | w_type_bf16, 7 | w_type_sf16, 8 | w_type_f12, 9 | w_type_f8, 10 | w_type_COUNT, 11 | }; 12 | 13 | // types sizeof 14 | static const unsigned int w_type_sizeof[w_type_COUNT] = { 4, 2, 2, 2, 2, 1 }; 15 | 16 | // names 
of types (in matmul.c) 17 | extern const char *w_type_name[w_type_COUNT]; 18 | 19 | // C types 20 | typedef unsigned short f16_t; 21 | typedef unsigned short bf16_t; 22 | typedef unsigned short sf16_t; 23 | typedef unsigned short f12_t; 24 | typedef unsigned char f8_t; 25 | -------------------------------------------------------------------------------- /src/utils/utf8.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------ 2 | // UTF8 3 | 4 | // encode one char to utf8, return length, 0 if error 5 | int utf8_char_encode(char *s, int code); 6 | 7 | // return char encoded code value and length 8 | int utf8_char_decode(const char *s, int *code); 9 | 10 | // return char encoded length and test if coding is valid 11 | int utf8_char_len(const char *s); 12 | 13 | // return count of utf8 char coded in string, return 0 if coding error found 14 | int utf8_get_char_count(const char *s); 15 | 16 | // text convert cr + lf or lf alone to cr 17 | bool utf8_cvt_crlf_to_cr(char *s); 18 | -------------------------------------------------------------------------------- /src/dir_info.txt: -------------------------------------------------------------------------------- 1 | directory matmul 2 | contain: 3 | - simd optimized matmul/float conversion code. interface matmul.h 4 | - simd optimized code for transformer self-attention (tr_opt_simd.c) 5 | 6 | directory model 7 | contain: 8 | - tokenizer/transformer/sampler code 9 | 10 | directory model/load 11 | contain: 12 | - json parser 13 | - tokenizer/transformer loader 14 | 15 | directory vs_inc 16 | Visual studio 2012 express was used to build the project, but stdbool.h and stdint.h are not defined as standard headers with this compiler, so a minimal definition is done here. 17 | It is not required for GCC build. 
18 | 19 | directory util 20 | contain various utils non-llm specific code 21 | -------------------------------------------------------------------------------- /tests/1_node/res_1socket.txt: -------------------------------------------------------------------------------- 1 | f16/bf16 2 | 3 | llama1 7B fp16 231 5.04 4 | llama2 7B fp16 249 5.09 5 | llama3 8B bf16 268 4.50 6 | llama31 8B bf16 398 4.50 7 | codellama 7B bf16 231 5.10 8 | mistral 7B bf16 331 4.71 9 | zephyr 7B bf16 294 4.77 10 | vigogne2 13B fp16 605 2.57 11 | tinyllama 1.1B bf16 297 31.11 12 | qwen2 7B bf16 543 4.75 13 | qwen2 0.5B bf16 67 55.05 14 | 15 | fp32 16 | 17 | mathstral 7B fp32 231 2.40 18 | 19 | using f12 conv 20 | 21 | llama1 7B fp16 231 6.17 22 | llama2 7B fp16 249 6.21 23 | llama3 8B bf16 268 5.53 24 | llama31 8B bf16 398 5.51 25 | codellama 7B bf16 231 6.26 26 | mistral 7B bf16 396 5.88 27 | zephyr 7B bf16 247 5.95 28 | vigogne2 13B fp16 781 3.15 29 | tinyllama 1.1B bf16 237 37.15 30 | qwen2 7B bf16 543 5.83 31 | 32 | using f8 conv 33 | 34 | mixtral 8x7B bf16 384 4.84 35 | -------------------------------------------------------------------------------- /src/utils/time_ev.h: -------------------------------------------------------------------------------- 1 | // ----------------------------------------- 2 | // debug/optimization: eval operations times 3 | 4 | // enable to eval times with debugger 5 | // #define OPT_EVAL_TIMES 6 | 7 | #ifdef OPT_EVAL_TIMES 8 | 9 | #define MAX_TIMES 16 10 | 11 | struct op_time_t 12 | { 13 | uint64_t t0; 14 | uint64_t t_sum; 15 | int n_call; 16 | }; 17 | 18 | extern struct op_time_t op_time[MAX_TIMES]; 19 | 20 | uint64_t get_time_ctr(void); 21 | 22 | static __inline void tm_stop(int id) 23 | { 24 | struct op_time_t *t = &op_time[id]; 25 | t->t_sum += (get_time_ctr() - t->t0); 26 | t->n_call++; 27 | } 28 | 29 | void tm_print(void); 30 | 31 | #define T_START(id) op_time[id].t0 = get_time_ctr() 32 | #define T_STOP(id) tm_stop(id) 33 | #define T_RESET() 
memset(op_time, 0, sizeof(op_time)) 34 | #define T_CLR(id) op_time[id].t_sum = 0; op_time[id].n_call = 0 35 | #define T_PRINT() tm_print() 36 | 37 | #else 38 | 39 | #define T_RESET() 40 | #define T_START(id) 41 | #define T_STOP(id) 42 | #define T_CLR(id) 43 | #define T_PRINT() 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/utils/time_ev.c: -------------------------------------------------------------------------------- 1 | // ------------------------------------------- 2 | // time eval 3 | 4 | #include 5 | #include "time_ev.h" 6 | 7 | #ifdef OPT_EVAL_TIMES 8 | 9 | #include 10 | #include 11 | 12 | struct op_time_t op_time[MAX_TIMES] = { 0 }; 13 | 14 | uint64_t time_ctr_freq = 0; 15 | 16 | static void init_time_ctr_freq(void) 17 | { 18 | LARGE_INTEGER freq; 19 | QueryPerformanceFrequency(&freq); 20 | time_ctr_freq = freq.QuadPart; 21 | } 22 | 23 | uint64_t get_time_ctr(void) 24 | { 25 | LARGE_INTEGER ticks; 26 | QueryPerformanceCounter(&ticks); 27 | return ticks.QuadPart; 28 | } 29 | 30 | // print all times 31 | void tm_print(void) 32 | { 33 | int i; 34 | if (!time_ctr_freq) 35 | init_time_ctr_freq(); 36 | 37 | printf("\n----------\ntime list:\n"); 38 | for (i=0; in_call) 42 | printf("time[%d]: nc:%d\t dt:%.4f s\n", i, t->n_call, (double)t->t_sum/time_ctr_freq); 43 | } 44 | printf("----------\n"); 45 | } 46 | 47 | #endif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 pierrel55 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/model/omp_numa.h: -------------------------------------------------------------------------------- 1 | // omp for numa support 2 | 3 | // thread list numa map 4 | struct numa_thread_map_t 5 | { 6 | int nt_mp; // num threads in main process 7 | int n_threads; 8 | unsigned char tid_to_proc_id[MAX_NUMA_PROCS]; 9 | unsigned char tid_to_node_id[MAX_NUMA_PROCS]; 10 | }; 11 | 12 | extern struct numa_thread_map_t numa_map; 13 | 14 | // get weight data dy split size for y 15 | #define WD_GET_DY(y, dy, wy) ((y + dy) <= wy) ? dy : wy - y 16 | 17 | // return sizeof wd ne elements in bytes (usage required where f12 can be used) 18 | size_t wd_ne_sizeof(const struct w_dat_t *wd, size_t ne); 19 | 20 | // split and alloc weight datas in different memory nodes for numa configurations. 21 | void numa_alloc_wd(struct w_dat_t *wd, int nz, int wy, int wx, enum e_w_type w_type, bool mm_split); 22 | 23 | // copy or load datas to weights for one z unit (layer). 
24 | void numa_cpy_wd_z(struct w_dat_t *wd, int z_id, const void *s, file_t *f); 25 | 26 | // init OMP for numa configuration 27 | void numa_init_omp(int cfg_n_procs, int cfg_n_nodes); 28 | 29 | // check omp thread proc match numa_map configuration 30 | void omp_proc_bind_numa_check(void); 31 | -------------------------------------------------------------------------------- /src/utils/term_utf8.h: -------------------------------------------------------------------------------- 1 | // UTF8 terminal 2 | 3 | // define a RGB color using "r.g.b" string, ex: "180.255.180" 4 | int term_get_color(const char *col_str); 5 | 6 | // define user_col[col_id] RGB color, must be done before call to term_init() 7 | void term_def_color(int col_id, int color); 8 | 9 | // set print color 10 | void text_color(int col_id); 11 | 12 | // single init, required if text_color() used 13 | void term_init(void); 14 | 15 | // wait for ms (debug usage) 16 | void term_wait_ms(int ms); 17 | 18 | // print UTF8 string and manage cr alone as cr + lf 19 | bool print_utf8(const char *s); 20 | 21 | // print UTF8 string and display control chars code (debug/check usage) 22 | void print_utf8_raw(const char *s); 23 | 24 | // ensure cursor position at new line 25 | void cursor_nl(void); 26 | void cursor_nl_set(void); 27 | 28 | // keyboard input a string, return utf8 encoded buffer size 29 | int kbd_input_utf8(char *s, int s_sizeof); 30 | 31 | // read a key without wait 32 | int read_key(void); 33 | 34 | // sleep for ms time 35 | void sleep_ms(int ms); 36 | 37 | // clipboard (chat menu), copy utf8 text to clipboard 38 | void term_cb_clear(void); 39 | void term_cb_add_utf8(const char *utf8); 40 | void term_cb_copy(void); 41 | -------------------------------------------------------------------------------- /llama_st.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Express 2012 for Windows 
Desktop 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llama_st", "llama_st.vcxproj", "{E88EB555-D508-48F5-8C4E-C3E6C9CC9903}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Debug|x64 = Debug|x64 10 | Release|Win32 = Release|Win32 11 | Release|x64 = Release|x64 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|Win32.ActiveCfg = Debug|Win32 15 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|Win32.Build.0 = Debug|Win32 16 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|x64.ActiveCfg = Debug|x64 17 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|x64.Build.0 = Debug|x64 18 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|Win32.ActiveCfg = Release|Win32 19 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|Win32.Build.0 = Release|Win32 20 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|x64.ActiveCfg = Release|x64 21 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|x64.Build.0 = Release|x64 22 | EndGlobalSection 23 | GlobalSection(SolutionProperties) = preSolution 24 | HideSolutionNode = FALSE 25 | EndGlobalSection 26 | EndGlobal 27 | -------------------------------------------------------------------------------- /src/utils/numa.h: -------------------------------------------------------------------------------- 1 | // numa informations. 
2 | #define MAX_NUMA_PROCS 64 // max supported procs (need to manage processor group if more needed, or can also disable hyperthreading in BIOS) 3 | #define MAX_NUMA_NODES 8 // max supported nodes (can be increased with current code) 4 | 5 | // numa informations 6 | struct numa_inf_t 7 | { 8 | int mt_node; // main thread node 9 | int mt_procs; // main thread node proc count 10 | int n_nodes; // nodes count 11 | int n_procs; // physical processors count 12 | unsigned char proc_list[MAX_NUMA_PROCS]; // procs list batched with same node id 13 | unsigned char proc_node[MAX_NUMA_PROCS]; // node id for each processor in proc_list 14 | unsigned char node_nprocs[MAX_NUMA_NODES]; // proc count in each node 15 | }; 16 | 17 | // global numa informations, use as read only 18 | extern struct numa_inf_t numa; 19 | 20 | // init numa struct 21 | void init_numa_info(void); 22 | 23 | // display mem available in nodes 24 | void numa_disp_mem(void); 25 | 26 | // set proc for current thread 27 | bool numa_set_thread_proc(int proc_id); 28 | 29 | // return proc for current thread 30 | int numa_get_thread_proc(void); 31 | 32 | // -------------------------- 33 | // memory alloc/free 34 | 35 | // reserve physical memory in node 36 | void *numa_alloc(size_t sz, int node); 37 | 38 | // free memory allocated with numa_alloc 39 | void numa_free(void *p); 40 | -------------------------------------------------------------------------------- /src/main.c: -------------------------------------------------------------------------------- 1 | #include "l_util.h" 2 | #include "model.h" 3 | 4 | #ifdef CHECK_EXIT 5 | #include "mem_alloc.h" 6 | #include "omp_numa.h" 7 | #endif 8 | 9 | int main(int argc, char *argv[]) 10 | { 11 | if (APP_ERROR()) // catch error return point 12 | return -1; 13 | 14 | #if 1 15 | if (argc != 2) 16 | { 17 | msg_info("Usage: llama_st \n"); 18 | msg_info("Example: llama_st run_json/run_llama2.json\n"); 19 | return -1; 20 | } 21 | build_model(argv[1]); 22 | #else 23 | // dev mode 
// ------------------------------------------------------------------
// f32 * f32 => f32, scalar FPU fallback (no SIMD).
// res[y] = dot product of vec with row y of mat, for y_mat rows of
// len_vec floats each (mat stored row-major, rows contiguous).
static void matmul_f32_f32_fpu(float *res, const float *vec, const float *mat, int len_vec, int y_mat)
{
  int y;
  for (y=0; y!=y_mat; y++)
  {
    const float *row = mat + y * len_vec;  // start of matrix row y
    float sum = 0.0f;
    int i;
    for (i=0; i!=len_vec; i++)
      sum += vec[i] * row[i];
    res[y] = sum;
  }
}
// ------------------------------------------------------------------
// f32 * f32 => f32, AVX + FMA3 version: 8 floats per step.
// NOTE(review): assumes vec/mat are 32-byte aligned (_mm256_load_ps)
// and len_vec is a multiple of 8 — TODO confirm against matmul_init()
// allocation rules, as with the other simd kernels.
static void matmul_f32_f32_avx1(float *res, const float *vec, const float *mat, int len_vec, int y_mat)
{
  int y;
  for (y=0; y!=y_mat; y++)
  {
    const float *row = mat + y * len_vec;       // matrix row y
    __m256 sum8 = _mm256_setzero_ps();          // 8 partial sums
    int i;
    for (i=0; i!=len_vec; i+=8)
      sum8 = _mm256_fmadd_ps(_mm256_load_ps(vec + i), _mm256_load_ps(row + i), sum8);
    res[y] = hsum_ps_avx1(sum8);                // horizontal reduce (mm_hsum.h)
  }
}
10 | load: E:/codellama/codellama-7b-instruct-hf/model-00001-of-00002.safetensors 11 | load: E:/codellama/codellama-7b-instruct-hf/model-00002-of-00002.safetensors 12 | sampler config: 13 | temperature : 0.60 14 | topp : 0.90 15 | topk : 30 16 | topp_minp : 0.05 17 | topp_eos : true 18 | repeat_penalty : 0.00 19 | repeat_penalty_n : 50 20 | eos_amp : 0.50 21 | eos_amp_n : 150 22 | rand seed : 1234 23 | Generate: max 16384 tokens.. 24 | - Press 'esc' key to break generation. 25 | bool is_prime(int x) 26 | { 27 | if (x < 2) 28 | return false; 29 | 30 | if (x % 2 == 0) 31 | return x == 2; 32 | 33 | int root = (int)std::sqrt(x); 34 | for (int i = 3; i <= root; i += 2) 35 | if (x % i == 0) 36 | return false; 37 | 38 | return true; 39 | } 40 | 41 | int main() 42 | { 43 | int n; 44 | cin >> n; 45 | 46 | vector nums(n); 47 | for (int i = 0; i < n; ++i) 48 | { 49 | cin >> nums[i]; 50 | } 51 | 52 | int count = 0; 53 | for (int i = 0; i < n; ++i) 54 | { 55 | if (is_prime(nums[i])) 56 | count++; 57 | } 58 | 59 | cout << count; 60 | 61 | return 0; 62 | } 63 | total time: 45.26s for 231 tokens, tok/s: 5.10 64 | Press any key to continue . . . 
#include "l_util.h"
#include "model.h"
#include "term_utf8.h"
#include "time_ev.h"

// re-use function defined in chat.c: decode token_id to text and print it.
// NOTE(review): prob < 0 presumably disables probability display — confirm in chat.c
void tokenizer_decode_print_ex(int token_id, float prob);

// ----------------------------------------------------------------------------
// generation loop: encode the configured prompt, forward it through the
// transformer, then sample and print tokens until EOS/EOT is sampled, the
// conf->gen_run_steps limit is reached, or the user presses the 'esc' key.
void generate(void)
{
  int i, t0, t1, n_gen, run_steps;
  struct run_conf_t *conf = &model.config;
  const struct mt_list_t *mt_list = &model.tokenizer.mt_list;

  msg_info("Generate: max %d tokens..\n", conf->gen_run_steps);
  msg_info("- Press 'esc' key to break generation.\n");
  T_RESET(); // dev mode, eval code time (no-op unless OPT_EVAL_TIMES defined)

  // time stats: t0/t1 in milliseconds, used for final tok/s report
  t0 = time_in_ms();

  // forward init prompt: last argument is true only for the final prompt
  // token (logits are only needed once the whole prompt is consumed)
  tokenizer_encode(conf->gen_mode_prompt);
  for (i=0; i<mt_list->n_list; i++)
  {
    int token = mt_list->mt[i].tok_id;
    forward(token, false, i == (mt_list->n_list-1));
    tokenizer_decode_print_ex(token, -1.0f); // -1.0: no sampled probability for prompt tokens
  }

  // generate
  for (run_steps = 0; run_steps != conf->gen_run_steps; run_steps++)
  {
    // get token from logits into samp
    struct prob_index_t *pi = sampler_sample();

    // stop generation if 'esc' key (code 27) pressed
    if (read_key() == 27)
    {
      msg_info("{esc stop}");
      break;
    }

    // data-dependent terminating condition: end-of-sequence / end-of-turn token
    if ( (pi->index == model.config.token_eos)
      || (pi->index == model.config.token_eot))
      break;

    // print generated token with its sampled probability
    tokenizer_decode_print_ex(pi->index, pi->prob);

    // update logits: feed the sampled token back into the transformer
    forward(pi->index, true, true);
  }

  T_PRINT(); // dev mode: print accumulated op times

  // time elapsed; n_gen counts all tokens in the kv cache (prompt + generated)
  t1 = time_in_ms();
  n_gen = model.transformer.state.cache.n_tokens;
  msg_info("\ntotal time: %.2fs for %d tokens, tok/s: %.2f\n", (t1-t0) / 1000.0, n_gen, n_gen*1000.0 / (t1-t0));
}
-------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_codellama.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_codellama.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/codellama/codellama-7b-instruct-hf/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/codellama/codellama-7b-instruct-hf/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 11 | load: E:/codellama/codellama-7b-instruct-hf/model-00001-of-00002.safetensors 12 | load: E:/codellama/codellama-7b-instruct-hf/model-00002-of-00002.safetensors 13 | sampler config: 14 | temperature : 0.60 15 | topp : 0.90 16 | topk : 30 17 | topp_minp : 0.05 18 | topp_eos : true 19 | repeat_penalty : 0.00 20 | repeat_penalty_n : 50 21 | eos_amp : 0.50 22 | eos_amp_n : 150 23 | rand seed : 1234 24 | Generate: max 16384 tokens.. 25 | - Press 'esc' key to break generation. 26 | bool is_prime(int x) 27 | { 28 | if (x < 2) 29 | return false; 30 | 31 | if (x % 2 == 0) 32 | return x == 2; 33 | 34 | int root = (int)std::sqrt(x); 35 | for (int i = 3; i <= root; i += 2) 36 | if (x % i == 0) 37 | return false; 38 | 39 | return true; 40 | } 41 | 42 | int main() 43 | { 44 | int n; 45 | cin >> n; 46 | 47 | vector nums(n); 48 | for (int i = 0; i < n; ++i) 49 | { 50 | cin >> nums[i]; 51 | } 52 | 53 | int count = 0; 54 | for (int i = 0; i < n; ++i) 55 | { 56 | if (is_prime(nums[i])) 57 | count++; 58 | } 59 | 60 | cout << count; 61 | 62 | return 0; 63 | } 64 | total time: 36.91s for 231 tokens, tok/s: 6.26 65 | Press any key to continue . . . 
-------------------------------------------------------------------------------- /src/matmul/matmul_priv.h: -------------------------------------------------------------------------------- 1 | // common private header for matmul/conversion code 2 | #define N_64K (1 << 16) 3 | 4 | #define ABS_F16(x) ((x) & 0x7FFF) // abs for F16/BF16/SF16, clear sign bit 5 | 6 | // -------------------------------------- 7 | // data conversion to float 32 functions 8 | 9 | extern const cvt_f16_to_f32_t cvt_f16_to_f32_procs[simd_n]; 10 | extern const cvt_bf16_to_f32_t cvt_bf16_to_f32_procs[simd_n]; 11 | extern const cvt_sf16_to_f32_t cvt_sf16_to_f32_procs[simd_n]; 12 | 13 | // -------------------------------------- 14 | // vector to matrix multiply functions 15 | 16 | extern const matmul_f32_f32_t matmul_f32_f32_procs[simd_n]; 17 | extern const matmul_f32_f16_t matmul_f32_f16_procs[simd_n]; 18 | extern const matmul_f32_bf16_t matmul_f32_bf16_procs[simd_n]; 19 | extern const matmul_f32_sf16_t matmul_f32_sf16_procs[simd_n]; 20 | extern const matmul_f32_f12_t matmul_f32_f12_procs[simd_n]; 21 | extern const matmul_f32_f8_t matmul_f32_f8_procs[simd_n]; 22 | 23 | // -------------------------------------- 24 | // SF16 conversions, code in matmul_sf16.c 25 | 26 | void init_conv_sf16(void); 27 | void cvt_f16_to_sf16(sf16_t *sf16, const f16_t *f16, size_t ne); 28 | 29 | // -------------------------------------- 30 | // F16 conversions, code in matmul_f16.c 31 | 32 | void init_sw_f16c(void); 33 | void free_sw_f16c(void); 34 | void cvt_f32_to_f16(f16_t *f16, const float *f32, size_t ne); 35 | 36 | // -------------------------------------- 37 | // F12 conversions, code in matmul_f12.c 38 | 39 | void init_conv_f12(void); 40 | void cvt_f16_to_f12(f12_t *f12, const f16_t *f16, size_t ne); 41 | void cvt_bf16_to_f12(f12_t *f12, const f16_t *f16, size_t ne); 42 | 43 | // -------------------------------------- 44 | // F8 conversions, code in matmul_f8.c 45 | 46 | void init_conv_f8(void); 47 | void 
cvt_f16_to_f8(f8_t *f8, const f16_t *f16, size_t ne); 48 | void cvt_bf16_to_f8(f8_t *f8, const bf16_t *bf16, size_t ne); -------------------------------------------------------------------------------- /src/model/sampler.h: -------------------------------------------------------------------------------- 1 | // ---------------------------------------------------------------------------- 2 | // The struct sampler_t, which takes logits and returns a sampled token 3 | // sampling can be done in a few ways: greedy argmax, sampling, top-p sampling 4 | 5 | // struct used when sorting probabilities during top-p sampling 6 | struct prob_index_t 7 | { 8 | float prob; 9 | int index; 10 | }; 11 | 12 | // sampler config 13 | struct sampler_conf_t 14 | { 15 | float temperature; // 0.0 to 2.0: 0:greedy decoding 2.0:maximum creativity 16 | float topp; // 0.01 to 0.99: max probability sum of top tokens 17 | int topk; // (integer) limit size of top tokens list 5..200 (0 = disable) 18 | float topp_minp; // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 19 | bool topp_eos; // true: limit topp list size to token with probability >= EOS 20 | float repeat_penalty; // 0.0..2.0 repeat penalty (0.0 = disable) 21 | int repeat_penalty_n; // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 22 | float eos_amp; // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 23 | int eos_amp_n; // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 24 | int rand_seed; // (integer) random seed 25 | char *ch_restrict; // if string defined, define ascii + allowed chars list in sampled tokens. 
26 | }; 27 | 28 | struct sampler_t 29 | { 30 | struct sampler_conf_t conf; // config 31 | 32 | uint64_t rng_state; 33 | struct prob_index_t *probindex; // buffer used in top-p sampling 34 | int *tk_select; // binary array for restricted tokens, (NULL if unused) 35 | }; 36 | 37 | void build_sampler(void); 38 | void free_sampler(void); 39 | 40 | // sample from transformer logits 41 | struct prob_index_t *sampler_sample(void); 42 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_llama1.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama1_st/7b/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama1_st/7b/config.json 7 | n_kv_heads undefined, assumed = n_heads (32) 8 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 9 | torch float type: fp16 10 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 11 | processor(s) core(s) used: 12 in 1 node(s). 12 | load: D:/llama1_st/7b/model-00001-of-00002.safetensors 13 | load: D:/llama1_st/7b/model-00002-of-00002.safetensors 14 | sampler config: 15 | temperature : 0.70 16 | topp : 0.75 17 | topk : 25 18 | topp_minp : 0.05 19 | topp_eos : true 20 | repeat_penalty : 0.05 21 | repeat_penalty_n : 50 22 | eos_amp : 0.50 23 | eos_amp_n : 150 24 | rand seed : 1234 25 | Generate: max 2048 tokens.. 26 | - Press 'esc' key to break generation. 27 | The explanation for the existence of seasons is a bit more complicated than the 28 | simple explanation that the Earth is tilted. It's actually a combination of the 29 | Earth's tilt and its orbit around the Sun. 30 | The Earth's axis is tilted at an angle of 23.5 degrees to the plane of its orbit 31 | around the Sun. 
This means that the northern hemisphere is tilted toward the Su 32 | n in summer and away from the Sun in winter. The southern hemisphere is tilted a 33 | way from the Sun in summer and toward the Sun in winter. 34 | The Earth's orbit around the Sun is not a perfect circle. Instead, it's an ellip 35 | se, or a stretched-out circle. The Sun is not at the center of the ellipse. It's 36 | at one of the foci, or the two points where the ellipse is widest. 37 | The Earth is closer to the Sun in January than in July. The distance between the 38 | Earth and the Sun varies from about 91 million miles in January to about 94 mil 39 | lion miles in July. 40 | total time: 45.83s for 231 tokens, tok/s: 5.04 41 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_llama1.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama1_st/7b/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama1_st/7b/config.json 7 | n_kv_heads undefined, assumed = n_heads (32) 8 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 9 | torch float type: fp16 10 | model weights converted to float12. 11 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 12 | processor(s) core(s) used: 12 in 1 node(s). 13 | load: D:/llama1_st/7b/model-00001-of-00002.safetensors 14 | load: D:/llama1_st/7b/model-00002-of-00002.safetensors 15 | sampler config: 16 | temperature : 0.70 17 | topp : 0.75 18 | topk : 25 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.05 22 | repeat_penalty_n : 50 23 | eos_amp : 0.50 24 | eos_amp_n : 150 25 | rand seed : 1234 26 | Generate: max 2048 tokens.. 27 | - Press 'esc' key to break generation. 
28 | The explanation for the existence of seasons is a bit more complicated than the 29 | simple explanation that the Earth is tilted. It's actually a combination of the 30 | Earth's tilt and its orbit around the Sun. 31 | The Earth's axis is tilted at an angle of 23.5 degrees to the plane of its orbit 32 | around the Sun. This means that the northern hemisphere is tilted toward the Su 33 | n in summer and away from the Sun in winter. The southern hemisphere is tilted a 34 | way from the Sun in summer and toward the Sun in winter. 35 | The Earth's orbit around the Sun is not a perfect circle. Instead, it's an ellip 36 | se, or a stretched-out circle. The Sun is not at the center of the ellipse. It's 37 | at one of the foci, or the two points where the ellipse is widest. 38 | The Earth is closer to the Sun in January than in July. The distance between the 39 | Earth and the Sun varies from about 91 million miles in January to about 94 mil 40 | lion miles in July. 41 | total time: 37.41s for 231 tokens, tok/s: 6.17 42 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_tinyllama.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_tinyllama.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/model.safetensors 12 | sampler config: 13 | temperature : 0.60 14 | topp : 0.65 15 | topk : 25 16 | topp_minp : 0.05 17 | topp_eos : true 18 | repeat_penalty : 0.05 19 | repeat_penalty_n : 50 20 | eos_amp : 0.50 21 | eos_amp_n : 150 22 | rand seed : 1234 23 | Generate: max 2048 tokens.. 24 | - Press 'esc' key to break generation. 25 | The explanation for the existence of seasons is a complex and multifaceted proce 26 | ss that involves the Earth's orbit around the Sun, the Earth's rotation around t 27 | he Sun, and the Earth's position relative to the Sun's position in space. The pr 28 | ocess of seasons is driven by the Earth's orbit around the Sun, which causes the 29 | Earth's axis to tilt slightly relative to the direction of the Sun's travel. Th 30 | is tilt causes the Earth's surface to receive more or less solar radiation depen 31 | ding on the season. The Earth's rotation around the Sun also causes the Earth's 32 | magnetic field to vary, which affects the Earth's magnetic field and the way tha 33 | t energy is transported through the atmosphere. The Earth's position relative to 34 | the Sun's position in space also affects the amount of energy that is absorbed 35 | and released by the Earth's atmosphere. Overall, the complex and multifaceted pr 36 | ocess of seasons is driven by the Earth's orbit around the Sun, the Earth's rota 37 | tion around the Sun, and the Earth's position relative to the Sun's position in 38 | space 39 | total time: 6.38s for 237 tokens, tok/s: 37.15 40 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/matmul/mm_hsum.h: -------------------------------------------------------------------------------- 1 | #ifdef VS_2008 2 | // old compiler. some SSE not defined in intrin. (_mm_cvtph_ps/_mm_cvtps_ph/_mm_fmadd_ps etc..) 3 | // this use external linking. 
(slow but work) 4 | #include "conv_ph_ps.h" 5 | #endif 6 | 7 | #ifndef MM_USE_FMA 8 | #define _mm_fmadd_ps(a,b,c) _mm_add_ps(c,_mm_mul_ps(a,b)) 9 | #define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(c,_mm256_mul_ps(a,b)) 10 | #endif 11 | 12 | // ------------------------------------------------------------------ 13 | // SSE3/AVX horizontal sum 14 | // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction 15 | // ------------------------------------------------------------------ 16 | 17 | #if 1 18 | 19 | static __inline float hsum_ps_sse(__m128 v) 20 | { 21 | __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 22 | __m128 sums = _mm_add_ps(v, shuf); 23 | shuf = _mm_movehl_ps(shuf, sums); // high half -> low half 24 | sums = _mm_add_ss(sums, shuf); 25 | return _mm_cvtss_f32(sums); 26 | } 27 | 28 | static __inline float hsum_ps_avx1(__m256 v) 29 | { 30 | __m128 vlow = _mm256_castps256_ps128(v); 31 | __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 32 | vlow = _mm_add_ps(vlow, vhigh); // add the low 128 33 | return hsum_ps_sse(vlow); // and inline the sse3 version, which is optimal for AVX 34 | } 35 | 36 | #else 37 | 38 | // FPU. 39 | // note: can be faster than AVX/SSE versions (compiler optimized ?). 
40 | 41 | static __inline float hsum_ps_sse(__m128 v) 42 | { 43 | float *sum_4 = (float *)&v; 44 | return sum_4[0] + sum_4[1] + sum_4[2] + sum_4[3]; 45 | } 46 | 47 | static __inline float hsum_ps_avx1(__m256 v) 48 | { 49 | float *sum_8 = (float *)&v; 50 | return sum_8[0] + sum_8[1] + sum_8[2] + sum_8[3] + sum_8[4] + sum_8[5] + sum_8[6] + sum_8[7]; 51 | } 52 | 53 | #endif 54 | 55 | #define hsum_ps_sse_2x(a,b) hsum_ps_sse(_mm_add_ps(a,b)) 56 | #define hsum_ps_sse_4x(a,b,c,d) hsum_ps_sse(_mm_add_ps(_mm_add_ps(a,b),_mm_add_ps(c,d))) 57 | 58 | #define hsum_ps_avx_2x(a,b) hsum_ps_avx1(_mm256_add_ps(a,b)) 59 | #define hsum_ps_avx_4x(a,b,c,d) hsum_ps_avx1(_mm256_add_ps(_mm256_add_ps(a,b),_mm256_add_ps(c,d))) 60 | 61 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_llama2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama2.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/llama2/llama2-7b-chat-hf/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/llama2/llama2-7b-chat-hf/config.json 7 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 8 | torch float type: fp16 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 11 | load: E:/llama2/llama2-7b-chat-hf/model-00001-of-00002.safetensors 12 | load: E:/llama2/llama2-7b-chat-hf/model-00002-of-00002.safetensors 13 | sampler config: 14 | temperature : 0.90 15 | topp : 0.80 16 | topk : 40 17 | topp_minp : 0.05 18 | topp_eos : true 19 | repeat_penalty : 0.05 20 | repeat_penalty_n : 50 21 | eos_amp : 0.50 22 | eos_amp_n : 150 23 | rand seed : 1234 24 | Generate: max 4096 tokens.. 25 | - Press 'esc' key to break generation. 
26 | The explanation for the existence of seasons is due to the tilt of the Earth's a 27 | xis and its orbit around the sun. 28 | 29 | The Earth's axis is tilted at an angle of approximately 23.5 degrees relative to 30 | the plane of its orbit around the sun. This means that, as the Earth orbits the 31 | sun, different parts of the planet are tilted towards or away from the sun, res 32 | ulting in changes in the amount of sunlight that reaches the Earth's surface. 33 | 34 | During the summer months in the Northern Hemisphere, the Earth is tilted towards 35 | the sun, resulting in longer days and more direct sunlight reaching the surface 36 | . This leads to warmer temperatures and longer days. 37 | 38 | In contrast, during the winter months in the Northern Hemisphere, the Earth is t 39 | ilted away from the sun, resulting in shorter days and less direct sunlight reac 40 | hing the surface. This leads to colder temperatures and shorter days. 41 | 42 | The same process occurs in the Southern Hemisphere, but with the opposite season 43 | s. When it is summer in the Northern Hemisphere, it is winter in the Southern He 44 | misphere, and vice versa. 45 | total time: 48.91s for 249 tokens, tok/s: 5.09 46 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/model/tr_opt_inc.c: -------------------------------------------------------------------------------- 1 | // this file must be included in transformer.c if USE_THRD_BATCH defined. 
2 | // batch some tread work, allow to gain about 2 to 6 % speed 3 | 4 | #ifdef INC_THRD_BATCH 5 | 6 | // define qkv for self attention 7 | static _inline void opt_compute_qkv(float *q, float *k, float *v, const float *xb, const struct transformer_weights_t *w, int layer_id, mm_proc_t matmul_lw) 8 | { 9 | int n_thrd = numa_map.n_threads; 10 | int i; 11 | CHECK(n_thrd <= w->wk.wy); 12 | 13 | #pragma omp parallel for 14 | for (i=0; iwk.dy; 19 | int dy = WD_GET_DY(y, w->wk.dy, w->wk.wy); 20 | size_t ofs; 21 | 22 | lp = &w->wk.lp[i]; 23 | ofs = (size_t)layer_id * lp->sz_l; // same for k and v (same wy) 24 | _p = (const char *)lp->p + ofs; 25 | matmul_lw(k + y, xb, _p, w->wk.wx, dy); 26 | 27 | lp = &w->wv.lp[i]; 28 | _p = (const char *)lp->p + ofs; 29 | matmul_lw(v + y, xb, _p, w->wv.wx, dy); 30 | 31 | if (q) 32 | { 33 | y = i*w->wq.dy; 34 | dy = WD_GET_DY(y, w->wq.dy, w->wq.wy); 35 | lp = &w->wq.lp[i]; 36 | _p = (const char *)lp->p + (size_t)layer_id * lp->sz_l; 37 | matmul_lw(q + y, xb, _p, w->wq.wx, dy); 38 | } 39 | } 40 | } 41 | 42 | // in xb, work hb2, out hb 43 | static _inline void opt_compute_w1_w3_swiglu(float *hb, float *hb2, const float *xb, const struct transformer_weights_t *w, int layer_id, mm_proc_t matmul_lw) 44 | { 45 | int n_thrd = numa_map.n_threads; 46 | int i; 47 | CHECK(n_thrd <= w->w1.wy); 48 | 49 | #pragma omp parallel for 50 | for (i=0; iw1.dy; 55 | int dy = WD_GET_DY(y, w->w1.dy, w->w1.wy); 56 | int wx = w->w1.wx; 57 | int x, x1; 58 | size_t ofs; 59 | 60 | lp = &w->w1.lp[i]; 61 | ofs = (size_t)layer_id * lp->sz_l; // same for w1/w3 (same wy) 62 | _p = (const char *)lp->p + ofs; 63 | matmul_lw(hb + y, xb, _p, wx, dy); 64 | 65 | lp = &w->w3.lp[i]; 66 | _p = (const char *)lp->p + ofs; 67 | matmul_lw(hb2 + y, xb, _p, wx, dy); 68 | 69 | // swiglu 70 | x1 = y+dy; 71 | for (x=y; x not coded because this conversion in used only in matmul_f32_f12_procs / matmul_f32_f8_procs 31 | 32 | // ----------------------------------- 33 | // vector to matrix 
multiply functions 34 | 35 | // float32 * float32 => float32 36 | typedef void (* matmul_f32_f32_t)(float *res, const float *vec, const float *mat, int len_vec, int y_mat); 37 | 38 | // float32 * float16 => float32 39 | typedef void (* matmul_f32_f16_t)(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat); 40 | 41 | // float32 * bfloat16 => float32 42 | typedef void (* matmul_f32_bf16_t)(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat); 43 | 44 | // float32 * sfloat16 => float32 45 | typedef void (* matmul_f32_sf16_t)(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat); 46 | 47 | // float32 * float12 => float32 48 | typedef void (* matmul_f32_f12_t)(float *res, const float *vec, const f12_t *mat, int len_vec, int y_mat); 49 | 50 | // float32 * float8 => float32 51 | typedef void (* matmul_f32_f8_t)(float *res, const float *vec, const f8_t *mat, int len_vec, int y_mat); 52 | 53 | // list of functions 54 | struct matmul_procs_t 55 | { 56 | // convert 57 | cvt_f16_to_f32_t cvt_f16_to_f32; 58 | cvt_bf16_to_f32_t cvt_bf16_to_f32; 59 | cvt_sf16_to_f32_t cvt_sf16_to_f32; 60 | cvt_f12_to_f32_t cvt_f12_to_f32; 61 | 62 | // matmul 63 | matmul_f32_f32_t matmul_f32_f32; 64 | matmul_f32_f16_t matmul_f32_f16; 65 | matmul_f32_bf16_t matmul_f32_bf16; 66 | matmul_f32_sf16_t matmul_f32_sf16; 67 | matmul_f32_f12_t matmul_f32_f12; 68 | matmul_f32_f8_t matmul_f32_f8; 69 | 70 | // infos 71 | enum e_simd_typ simd_set; // initialized mode 72 | int cpu_f16c; // 1: f16c support 73 | }; 74 | 75 | // interface 76 | extern struct matmul_procs_t matmul_procs; 77 | 78 | // generic data types conversions 79 | void cvt_w_data(void *d, enum e_w_type d_type, const void *s, enum e_w_type s_type, size_t ne); 80 | 81 | // init 82 | void matmul_init(enum e_simd_typ simd_typ); 83 | 84 | // free some memory 85 | void matmul_exit(void); 86 | -------------------------------------------------------------------------------- 
/src/model/tokenizer.h: -------------------------------------------------------------------------------- 1 | // strings dictionary 2 | struct str_dic_t 3 | { 4 | char *buff; // all strings 0 ended concatenated 5 | int sz_alloc; // buff alloc size in bytes 6 | int wr_ofs; // write offset in buff 7 | int n_strings; // count of strings in buff 8 | }; 9 | 10 | // token 11 | struct tok_index_t 12 | { 13 | const char *str; // to string in tokenizer_t dic_tokens 14 | int tok_id; // origin tokenizer index (non sorted list) 15 | int id_to_sort; // origin index to sorted position (tok_id to str convert) 16 | }; 17 | 18 | // merge tokens id 19 | struct merge_id_t 20 | { 21 | int tok_id_l; // left token id 22 | int tok_id_r; // right token id 23 | int tok_id_m; // merged token id 24 | int merge_id; // id in merge list 25 | }; 26 | 27 | // token list element used by tokenizer encode 28 | struct m_tok_t 29 | { 30 | int score; // merge score with right token 31 | int tok_id; // token id 32 | int tok_id_m; // token id if merge with right token 33 | }; 34 | 35 | // merge tokens id list 36 | struct mt_list_t 37 | { 38 | struct m_tok_t *mt; 39 | int n_list; 40 | int n_alloc; 41 | }; 42 | 43 | // BPE tokenizer data 44 | struct tokenizer_t 45 | { 46 | bool mode_ll3; // llama3 tokenizer mode 47 | 48 | struct str_dic_t dic_tokens; // token string list 49 | struct tok_index_t *tok_index; // token id list 50 | int tok_index_list_size; 51 | 52 | // merge list 53 | struct 54 | { 55 | struct merge_id_t *id_list; // merge tokens id 56 | int list_size; 57 | int n_alloc; 58 | } merge; 59 | 60 | int id_special_base; // start index of special tokens not in model.vocab list 61 | int id_special_last; // last special tokens index 62 | int id_special_count; // count of special tokens 63 | 64 | // byte fallback + strips leading whitespace specific (see PR #89), active if not mode_ll3 65 | int token_id_bos_ws; 66 | int token_id_0x0; // <0x00> byte fallback 67 | int token_id_0xff; // <0xff> byte fallback
68 | 69 | // tokenizer_encode token list result 70 | struct mt_list_t mt_list; 71 | }; 72 | 73 | // find a token id from utf8 string, return -1 if not found 74 | int tokenizer_find_token_id(const char *str); 75 | 76 | // find a special token from string and check is special token 77 | int tokenizer_find_sp_token_id(const char *str); 78 | 79 | // return token string from token index 80 | const char *tokenizer_get_token_str(int token_id); 81 | 82 | // encode text, define mt_list token list 83 | void tokenizer_encode(const char *text); 84 | 85 | // return decoded token string. 86 | const char *tokenizer_decode(int token_id); 87 | 88 | // decode and print 89 | void tokenizer_decode_print(int token_id, bool disp_raw); 90 | 91 | // load and init tokenizer from .json file 92 | void build_tokenizer(void); 93 | 94 | // free allocated mem 95 | void free_tokenizer(void); 96 | -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_llama31.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama3.1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama3.1_st/8b-instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama3.1_st/8b-instruct/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: D:/llama3.1_st/8b-instruct/model-00001-of-00004.safetensors 12 | load: D:/llama3.1_st/8b-instruct/model-00002-of-00004.safetensors 13 | load: D:/llama3.1_st/8b-instruct/model-00003-of-00004.safetensors 14 | load: D:/llama3.1_st/8b-instruct/model-00004-of-00004.safetensors 15 | sampler config: 16 | temperature : 0.90 17 | topp : 0.85 18 | topk : 30 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.05 22 | repeat_penalty_n : 100 23 | eos_amp : 0.50 24 | eos_amp_n : 250 25 | rand seed : 1234 26 | Generate: max 131072 tokens.. 27 | - Press 'esc' key to break generation. 28 | The explanation for the existence of seasons is rooted in the Earth's tilt and o 29 | rbit around the Sun. The tilt of the Earth is approximately 23.5 degrees, which 30 | causes the amount of sunlight that reaches the planet's surface to vary througho 31 | ut the year. During the summer months, the Northern Hemisphere is tilted towards 32 | the Sun, resulting in longer days and more direct sunlight. Conversely, during 33 | the winter months, the Northern Hemisphere is tilted away from the Sun, resultin 34 | g in shorter days and less direct sunlight. 35 | This tilt is the primary cause of the changing seasons, with the Earth's orbit a 36 | round the Sun also playing a role. The Earth's orbit is elliptical, meaning that 37 | its distance from the Sun varies throughout the year. However, the difference i 38 | n distance has a minimal impact on the seasons, as the tilt of the Earth is the 39 | dominant factor. 40 | The four seasons that occur on Earth are: 41 | 1. **Spring**: Typically begins around March 20/21 in the Northern Hemisphere an 42 | d September 22/23 in the Southern Hemisphere. During this season, the weather st 43 | arts to warm up, and days get longer. 44 | 2. **Summer**: Begins around June 20/21 in the Northern Hemisphere and December 45 | 21/22 in the Southern Hemisphere. 
This season is characterized by long days, war 46 | m temperatures, and often dry conditions. 47 | 3. **Autumn** (or **Fall**): Begins around September 22/23 in the Northern Hemis 48 | phere and March 20/21 in the Southern Hemisphere. As the days shorten, temperatu 49 | res cool, and leaves on trees change color before falling. 50 | 4. **Winter**: Begins around December 21/22 in the Northern Hemisphere and June 51 | 20/21 in the Southern Hemisphere. This season is marked by shorter days, colder 52 | temperatures, and often snow and ice. 53 | 54 | The seasonal variations have a significant impact on the environment, ecosystems 55 | , and human societies. Understanding the causes of the seasons is essential for 56 | predicting weather patterns, managing resources, and planning activities. 57 | total time: 72.19s for 398 tokens, tok/s: 5.51 58 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_qwen2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_qwen2.5.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/qwen/qwen2.5-7B-Instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/qwen/qwen2.5-7B-Instruct/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: E:/qwen/qwen2.5-7B-Instruct/model-00001-of-00004.safetensors 12 | load: E:/qwen/qwen2.5-7B-Instruct/model-00002-of-00004.safetensors 13 | load: E:/qwen/qwen2.5-7B-Instruct/model-00003-of-00004.safetensors 14 | load: E:/qwen/qwen2.5-7B-Instruct/model-00004-of-00004.safetensors 15 | warning: tokenizer/transformer vocab_size missmatch (151665/152064) 16 | sampler config: 17 | temperature : 0.60 18 | topp : 0.65 19 | topk : 25 20 | topp_minp : 0.05 21 | topp_eos : true 22 | repeat_penalty : 0.00 23 | repeat_penalty_n : 0 24 | eos_amp : 0.05 25 | eos_amp_n : 300 26 | rand seed : 1234 27 | Generate: max 32768 tokens.. 28 | - Press 'esc' key to break generation. 29 | The explanation for the existence of seasons is that the Earth's axis is tilted 30 | at an angle of 23.5 degrees relative to its orbital plane. This tilt causes diff 31 | erent parts of the Earth to receive varying amounts of sunlight throughout the y 32 | ear, leading to the changing seasons. However, the Earth's axis is not fixed in 33 | space, but rather it precesses, meaning it traces out a circle in the sky over a 34 | period of about 26,000 years. This precession affects the direction in which th 35 | e Earth's axis points, and consequently, the position of the solstices and equin 36 | oxes relative to the stars. 37 | 38 | Given that the Earth's axis is tilted at 23.5 degrees, and it precesses over a p 39 | eriod of 26,000 years, calculate the average rate of precession in degrees per y 40 | ear. Additionally, determine the change in the position of the solstices relativ 41 | e to the stars over a period of 13,000 years. To determine the average rate of p 42 | recession in degrees per year, we start with the total precession period and the 43 | total angle of precession. The Earth's axis precesses in a circle over a period 44 | of 26,000 years, and the angle of precession is 360 degrees (since it completes 45 | one full circle). 
46 | 47 | The average rate of precession in degrees per year is given by: 48 | \[ 49 | \text{Average rate of precession} = \frac{360 \text{ degrees}}{26,000 \text{ yea 50 | rs}} = \frac{360}{26,000} \approx 0.013846 \text{ degrees per year} 51 | \] 52 | 53 | Next, we need to determine the change in the position of the solstices relative 54 | to the stars over a period of 13,000 years. Since the Earth's axis precesses at 55 | a rate of approximately 0.013846 degrees per year, the change in the position of 56 | the solstices over 13,000 years is: 57 | \[ 58 | \text{Change in position} = 0.013846 \text{ degrees per year} \times 13,000 \tex 59 | t{ years} = 180.00 \text{ degrees} 60 | \] 61 | 62 | Therefore, the change in the position of the solstices relative to the stars ove 63 | r a period of 13,000 years is: 64 | \[ 65 | \boxed{180 \text{ degrees}} 66 | \] 67 | total time: 93.12s for 543 tokens, tok/s: 5.83 68 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/utils/l_util.h: -------------------------------------------------------------------------------- 1 | // generic functions for llama api 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) 9 | #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) 10 | 11 | #define MBYTE (1024*1024) // 1 megabyte 12 | #define GBYTE (1024*1024*1024) // 1 gigabyte 13 | 14 | #define __no_return __declspec(noreturn) 15 | 16 | // ------------------------------------ 17 | // debug 18 | 19 | #if defined(_DEBUG) || defined(_CHECK) 20 | void debug_break(void); 21 | #define CHECK(a) ((a) ? 
(void)0 : debug_break()) 22 | #else 23 | #define CHECK(a) 24 | #endif 25 | 26 | // ------------------------------------ 27 | // floats checks 28 | bool check_no_nan_f32(const float *buff, size_t ne); 29 | 30 | // ------------------------------------ 31 | // softmax 32 | 33 | void softmax(float *x, int size); 34 | 35 | // ------------------------------------ 36 | // information messages 37 | 38 | void msg_info(const char *fmt, ...); 39 | 40 | // print some space (align text usage) 41 | void msg_spc(int n_spc); 42 | 43 | // ------------------------------------ 44 | // errors 45 | 46 | extern jmp_buf error_jmp; 47 | 48 | #define APP_ERROR() setjmp(error_jmp) 49 | 50 | // print error message and jump to error code 51 | void __no_return msg_error(const char *fmt, ...); 52 | 53 | // print assert error message and exit 54 | void __no_return assert_exit(const char *fmt, ...); 55 | 56 | // assert + position + exit 57 | #define _ASSERT(x) if (!(x)) assert_exit("%s:%s:%d: %s\n", __FILE__, __FUNCTION__, __LINE__, #x) 58 | 59 | // check range of an int32 value 60 | void check_range_i(int a, const char *name, int min, int max); 61 | 62 | // ------------------------------------ 63 | // rng 64 | 65 | void rand_seed(int seed); 66 | int rand_n(void); 67 | float rand1(void); 68 | float rand1s(void); 69 | 70 | // ------------------------------------ 71 | // time 72 | 73 | int time_in_ms(void); 74 | 75 | // ------------------------------------ 76 | // files, manage big files and trap errors 77 | 78 | typedef struct 79 | { 80 | void *handle; 81 | const char *name; 82 | int64_t size; 83 | // user datas 84 | int64_t seek_ofs; // offset to add to f_seek 85 | } file_t; 86 | 87 | #define f_SEEK_CUR 1 88 | #define f_SEEK_END 2 89 | #define f_SEEK_SET 0 90 | 91 | void f_seek(file_t *h, int64_t ofs, int origin); 92 | int64_t f_tell(file_t *h); 93 | 94 | void f_open(file_t *h, const char *name, const char *mode); 95 | void f_close(file_t *h); 96 | void f_read(void *p, int64_t size, file_t *h); 
97 | void f_write(void *p, int64_t size, file_t *h); 98 | 99 | // ------------------------------------ 100 | // wait return pressed and exit 101 | 102 | void wait_return_exit(void); 103 | 104 | // ------------------------------------ 105 | // display progress bar for long time operations 106 | 107 | void progress_bar_init(bool new_line, int64_t max_value); 108 | void progress_bar_update(int64_t value); 109 | void progress_bar_done(void); 110 | 111 | // adjust range of float x value in [x_min..x_max] 112 | void adjust_range_f32(float *x, const char *x_name, float x_min, float x_max); 113 | 114 | // adjust range of int x value in [x_min..x_max] 115 | void adjust_range_int(int *x, const char *x_name, int x_min, int x_max); 116 | -------------------------------------------------------------------------------- /src/model/kv_cache.c: -------------------------------------------------------------------------------- 1 | // kv cache: is part of transformer.c but moved in separate file for clarity. 2 | // this code uses a method to ensure kv cache is never full. (experimental.. is that good ?) 3 | // the method consists in 'forgetting' the oldest tokens. 4 | // in chat mode: 5 | // - series of one user entry + one llm reply are deleted after system prompt 6 | // in generate mode: 7 | // - only delete some first tokens. 8 | // the hole produced after system prompt is removed and rope for kv data following hole is updated as if tokens follow prompt without hole. 9 | // todo: difficult to test effect as it requires context almost full to operate. 10 | 11 | #ifdef PACK_KV_CACHE 12 | 13 | #include "l_util.h" 14 | #include "model.h" 15 | 16 | // 'forget' some tokens in kv cache to reduce context size.
17 | static void reduce_kv_cache(int min_tokens_delete) 18 | { 19 | struct transformer_t *t = &model.transformer; 20 | const struct transformer_config_t *p = &t->config; 21 | struct transformer_runstate_t *s = &t->state; 22 | 23 | int n_ctx = s->cache.n_tokens; // current tokens count in context 24 | int min_del = n_ctx/20; // min tokens to delete (5% of context) 25 | int i0, i, n_del; 26 | 27 | if (model.config.run_mode == 0) 28 | { 29 | // generate mode, very unlikely to happen (no eot produced before context full) 30 | // done to ensure no cache overflow. 31 | i0 = 0; 32 | i = min_del; 33 | } 34 | else // chat mode 35 | { 36 | if (min_tokens_delete < min_del) 37 | min_tokens_delete = min_del; 38 | 39 | i0 = t->state.cache.n_tokens_sys; // keep sys prompt if defined 40 | for (i=i0; istate.cache.tokens[i].sampled) 45 | break; 46 | 47 | // pass one llm reply 48 | for (; istate.cache.tokens[i].sampled) 50 | break; 51 | 52 | // test if enough deleted 53 | if ((i - i0) >= min_tokens_delete) 54 | break; 55 | } 56 | } 57 | 58 | // count of deleted tokens in cache 59 | n_del = i - i0; 60 | 61 | // update user info 62 | t->state.cache.n_tokens_del += n_del; 63 | 64 | #if 0 65 | // debug: display deleted token list 66 | { 67 | int j; 68 | msg_info("\n------\n kv delete %d tokens:\n", n_del); 69 | for (j=i0; jcache.tokens[j].token_id, true); 72 | msg_info(","); 73 | } 74 | msg_info("\n------\n"); 75 | } 76 | #endif 77 | 78 | // define < 0 rope rotation = - num of deleted tokens 79 | set_RoPE_pos(s->rope_sin_cos, -n_del, s->rope_freq, p->head_size/2); 80 | 81 | // compact and update kv cache rope 82 | s->cache.n_tokens = i0; 83 | s->cache.n_tokens_samp = 0; 84 | 85 | for (; icache.n_tokens++; 88 | 89 | // remove kv cache hole 90 | for (l=0; ln_layers; l++) 91 | { 92 | size_t i_ofs = ((size_t)l * n_ctx + i ) * p->kv_dim; 93 | size_t p_ofs = ((size_t)l * n_ctx + pos) * p->kv_dim; 94 | RoPE(&s->k_cache[i_ofs], &s->v_cache[i_ofs], s->rope_sin_cos, p->head_size, p->kv_dim, 
p->kv_dim); 95 | memcpy(&s->k_cache[p_ofs], &s->k_cache[i_ofs], p->kv_dim * sizeof(float)); 96 | memcpy(&s->v_cache[p_ofs], &s->v_cache[i_ofs], p->kv_dim * sizeof(float)); 97 | } 98 | 99 | // compact token list 100 | s->cache.tokens[pos] = s->cache.tokens[i]; 101 | if (s->cache.tokens[pos].sampled) 102 | s->cache.n_tokens_samp++; 103 | else 104 | s->cache.n_tokens_samp = 0; 105 | } 106 | } 107 | 108 | // reserve tokens in kv cache for llm generation. 109 | // return count of deleted tokens (0 if enough space was already available) 110 | int reserve_kv_cache(int min_token_reserve) 111 | { 112 | int token_prev = model.transformer.state.cache.n_tokens; 113 | int token_left = model.transformer.config.seq_len - token_prev; // free slots left in context 114 | if (token_left < min_token_reserve) // not enough room: 'forget' oldest tokens 115 | { 116 | reduce_kv_cache(min_token_reserve - token_left); 117 | return token_prev - model.transformer.state.cache.n_tokens; // return num of deleted tokens 118 | } 119 | return 0; 120 | } 121 | 122 | #endif // PACK_KV_CACHE -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_mixtral_f8.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_mixtral.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/config.json 7 | torch float type: bf16 8 | model weights converted to float8. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s).
11 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00001-of-00019.safetensors 12 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00002-of-00019.safetensors 13 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00003-of-00019.safetensors 14 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00004-of-00019.safetensors 15 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00005-of-00019.safetensors 16 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00006-of-00019.safetensors 17 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00007-of-00019.safetensors 18 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00008-of-00019.safetensors 19 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00009-of-00019.safetensors 20 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00010-of-00019.safetensors 21 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00011-of-00019.safetensors 22 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00012-of-00019.safetensors 23 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00013-of-00019.safetensors 24 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00014-of-00019.safetensors 25 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00015-of-00019.safetensors 26 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00016-of-00019.safetensors 27 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00017-of-00019.safetensors 28 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00018-of-00019.safetensors 29 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00019-of-00019.safetensors 30 | sampler config: 31 | temperature : 0.70 32 | topp : 0.80 33 | topk : 40 34 | topp_minp : 0.05 35 | topp_eos : true 36 | repeat_penalty : 0.05 37 | repeat_penalty_n : 50 38 | eos_amp : 0.00 39 | eos_amp_n : 150 40 | rand seed : 1234 41 | Generate: max 32768 tokens.. 42 | - Press 'esc' key to break generation. 43 | The explanation for the existence of seasons is not as simple as you might think 44 | . 
The tilt of the Earth's axis and its orbit around the sun combine to create th 45 | e seasons, but the tilt itself is the most important factor. The Earth's axis is 46 | tilted at an angle of 23.5 degrees from the vertical, and this tilt is what cau 47 | ses the sun's rays to hit different parts of the Earth at different angles durin 48 | g different times of the year. 49 | 50 | During the summer, the Earth's tilt causes the sun's rays to hit the Earth at a 51 | more direct angle, resulting in more intense heat and longer days. In the winter 52 | , the tilt causes the sun's rays to hit the Earth at a more oblique angle, resul 53 | ting in less intense heat and shorter days. The tilt also causes the sun to appe 54 | ar higher in the sky during the summer and lower in the sky during the winter. 55 | 56 | The Earth's orbit around the sun also plays a role in the creation of seasons, b 57 | ut it is less important than the tilt of the Earth's axis. The Earth's orbit is 58 | not a perfect circle, but rather an ellipse. This means that the Earth is closer 59 | to the sun at certain times of the year and farther away at other times. Howeve 60 | r, the difference in distance between the Earth and the sun is not enough to sig 61 | nificantly affect the intensity of the sun's rays or the length of the days. 62 | 63 | In summary, the tilt of the Earth's axis is the primary factor responsible for t 64 | he creation of seasons. The Earth's orbit around the sun also plays a role, but 65 | it is less important than the tilt. The tilt causes the sun's rays to hit differ 66 | ent parts of the Earth at different angles, resulting in the varying temperature 67 | s and day lengths that we associate with the seasons. 68 | total time: 79.28s for 384 tokens, tok/s: 4.84 69 | Press any key to continue . . . 
-------------------------------------------------------------------------------- /src/utils/mem_alloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "l_util.h" 3 | #include "mem_alloc.h" 4 | 5 | #define ALLOC_SZ_ALIGN (256/8) // align for AVX 6 | 7 | // ------------------------------------ 8 | // memory allocation + check 9 | 10 | #if !defined(_DEBUG) && !defined(CHECK_ALLOC) 11 | 12 | void *malloc_check(size_t size) 13 | { 14 | void *p = _aligned_malloc(size, ALLOC_SZ_ALIGN); 15 | if (!p) 16 | msg_error("malloc failed to alloc %d bytes\n", size); 17 | return p; 18 | } 19 | 20 | void *calloc_check(size_t size) 21 | { 22 | void *p = _aligned_malloc(size, ALLOC_SZ_ALIGN); 23 | if (!p) 24 | msg_error("calloc failed to alloc %d bytes\n", size); 25 | memset(p, 0, size); 26 | return p; 27 | } 28 | 29 | void *realloc_check(void *ptr, size_t size) 30 | { 31 | void *p = _aligned_realloc(ptr, size, ALLOC_SZ_ALIGN); 32 | if (!p) 33 | msg_error("realloc failed to alloc %d bytes\n", size); 34 | return p; 35 | } 36 | 37 | void free_check(void *ptr) 38 | { 39 | if (ptr) 40 | _aligned_free(ptr); 41 | } 42 | 43 | void dbg_print_alloc(void) 44 | { 45 | // not checked in release 46 | } 47 | 48 | #else 49 | 50 | // debug malloc + allocated size infos 51 | 52 | // stats 53 | struct 54 | { 55 | size_t size_alloc_sum; // current allocated size 56 | size_t size_alloc_sum_max; // max reached allocated size 57 | size_t size_block_max; // max allocated block size 58 | int n_header; // current header in use 59 | int n_malloc; // sum count 60 | int n_realloc; // sum count 61 | } a_inf = { 0 }; 62 | 63 | // alloc header, use size that keep alignment for sse 64 | typedef union 65 | { 66 | size_t sz; 67 | char mem[ALLOC_SZ_ALIGN]; 68 | } a_hdr; 69 | 70 | // return aligned address 71 | static void *mem_align(a_hdr *h, size_t alloc_sz) 72 | { 73 | void *p; 74 | if (alloc_sz > a_inf.size_block_max) 75 | a_inf.size_block_max = alloc_sz; 
76 | 77 | if (a_inf.size_alloc_sum > a_inf.size_alloc_sum_max) 78 | a_inf.size_alloc_sum_max = a_inf.size_alloc_sum; 79 | 80 | h->sz = alloc_sz; // record block size in the hidden header 81 | p = h + 1; // user pointer starts just past the header 82 | CHECK(((size_t)p & (ALLOC_SZ_ALIGN-1)) == 0); 83 | return p; 84 | } 85 | 86 | void *malloc_check(size_t size) 87 | { 88 | a_hdr *h = (a_hdr *)_aligned_malloc(size + sizeof(a_hdr), ALLOC_SZ_ALIGN); 89 | if (!h) 90 | msg_error("malloc failed to alloc %d bytes\n", size); // NOTE(review): %d with a size_t argument; %zu would be correct 91 | 92 | memset(h, 0, sizeof(a_hdr)); // 0 unused bytes 93 | a_inf.size_alloc_sum += size; 94 | a_inf.n_header++; 95 | a_inf.n_malloc++; 96 | return mem_align(h, size); 97 | } 98 | 99 | void *calloc_check(size_t size) 100 | { 101 | void *p = malloc_check(size); // malloc_check aborts via msg_error on failure 102 | memset(p, 0, size); 103 | return p; 104 | } 105 | 106 | void *realloc_check(void *ptr, size_t size) 107 | { 108 | if (ptr) 109 | { 110 | a_hdr *h = (a_hdr *)ptr - 1; 111 | a_inf.size_alloc_sum -= h->sz; 112 | CHECK(a_inf.size_alloc_sum >= 0); // NOTE(review): size_alloc_sum is size_t (unsigned) so this check can never fail; underflow wraps instead 113 | h = (a_hdr *)_aligned_realloc(h, size + sizeof(a_hdr), ALLOC_SZ_ALIGN); 114 | if (!h) 115 | msg_error("realloc failed to alloc %d bytes\n", size); 116 | 117 | a_inf.size_alloc_sum += size; 118 | a_inf.n_realloc++; 119 | return mem_align(h, size); 120 | } 121 | return malloc_check(size); // realloc(NULL, size) behaves as malloc 122 | } 123 | 124 | void free_check(void *ptr) 125 | { 126 | if (ptr) 127 | { 128 | a_hdr *h = (a_hdr *)ptr - 1; 129 | a_inf.size_alloc_sum -= h->sz; 130 | a_inf.n_header--; 131 | CHECK(a_inf.size_alloc_sum >= 0); // NOTE(review): always true for unsigned size_t, same issue as in realloc_check 132 | CHECK(a_inf.n_header >= 0); 133 | _aligned_free(h); 134 | } 135 | } 136 | 137 | // print allocation statistics (debug / CHECK_ALLOC builds) 138 | void dbg_print_alloc(void) 139 | { 140 | msg_info("INFO mem alloc:\n"); 141 | msg_info(" size_alloc_sum %.6f Mb\n", (double)a_inf.size_alloc_sum / (1024*1024)); 142 | msg_info(" size_block_max; %.6f Mb\n", (double)a_inf.size_block_max / (1024*1024)); // NOTE(review): stray ';' in the label string 143 | msg_info(" n_header %d\n", a_inf.n_header); 144 | msg_info(" n_malloc %d\n", a_inf.n_malloc); 145 | msg_info(" n_realloc %d\n", a_inf.n_realloc); 146 | msg_info(" size_alloc_sum_max
%.6f Mb\n", (double)a_inf.size_alloc_sum_max / (1024*1024)); 147 | if (a_inf.size_alloc_sum || a_inf.n_header) 148 | msg_info(" >some memory is still allocated: %u bytes\n", (int)a_inf.size_alloc_sum); 149 | else 150 | msg_info(" >all memory has been freed.\n"); 151 | } 152 | 153 | #endif 154 | 155 | // alloc string 156 | char *str_alloc(const char *str, int len) 157 | { 158 | char *s; 159 | CHECK(len >= 0); 160 | s = malloc_check(len+1); 161 | memcpy(s, str, len); 162 | s[len] = 0; 163 | return s; 164 | } 165 | -------------------------------------------------------------------------------- /src/model/model.h: -------------------------------------------------------------------------------- 1 | // user application header 2 | #include "transformer.h" 3 | #include "tokenizer.h" 4 | #include "sampler.h" 5 | 6 | // model identifier 7 | enum e_model_id 8 | { 9 | model_id_tinyllama = 0, // "tinyllama", 10 | model_id_llama1, // "llama1", 11 | model_id_llama2, // "llama2", 12 | model_id_code_llama, // "codellama", 13 | model_id_llama3, // "llama3", 14 | model_id_llama31, // "llama31", 15 | model_id_mistral, // "mistral", 16 | model_id_mathstral, // "mathstral", 17 | model_id_zephyr, // "zephyr", 18 | model_id_mixtral, // "mixtral", 19 | model_id_vigogne2, // "vigogne2", 20 | model_id_qwen2, // "qwen2", 21 | model_id_count, // models count 22 | }; 23 | 24 | extern const char *model_id_names[model_id_count]; 25 | 26 | // application run mode 27 | enum e_run_mode 28 | { 29 | run_mode_generate = 0, 30 | run_mode_chat, 31 | }; 32 | 33 | // chat mode config 34 | struct chat_cfg_t 35 | { 36 | bool chat_use_colors; // use colors for user/assistant text 37 | 38 | // forward tokens display options 39 | int fwd_disp_mode; // 0: display nothing, 1: tokens list 40 | 41 | // method used to generate the chat prompt format 42 | int chat_prompt_mode; 43 | 44 | // prompt names displayed for assistant and user 45 | char *chat_assistant_name; 46 | char *chat_user_name; 47 | 48 | // mode 0 49 
| char *cm0_sys_prompt; 50 | char *cm0_user_prompt; 51 | 52 | // mode 1 53 | char *cm1_sys_template; 54 | char *cm1_user_first_template; 55 | char *cm1_user_template; 56 | char *cm1_end_template; 57 | char *cm1_sys_prompt; 58 | char *cm1_user_prompt; 59 | 60 | // mode 2 61 | char *cm2_sys_template; 62 | char *cm2_user_template; 63 | char *cm2_user_name_sw; // swith user/assistant string in generate mode 64 | char *cm2_sys_prompt; 65 | char *cm2_user_prompt; 66 | }; 67 | 68 | // run configuration defined in json 69 | struct run_conf_t 70 | { 71 | // model identifier 72 | char *model_ident; // define model type for model specificities 73 | 74 | // model load 75 | struct 76 | { 77 | int model_num_safetensors; // count of .safetensors files in model 78 | char *model_path; // path to model, ex "C:/llama2/llama2-7b-chat-hf" 79 | char *tokenizer_name; // tokenizer file name (ex tokenizer.json) 80 | } load; 81 | 82 | // set or override rope freq 83 | float rope_set; // set/change rope inv freq value, ignored if 0 84 | 85 | // sampler config defined in sampler struct 86 | 87 | // load parameters 88 | bool cvt_sf16; // convert model to sfloat16 at load 89 | bool cvt_f12; // convert model to float12 at load 90 | bool cvt_f8; // convert model to float8 at load 91 | 92 | // hardware parameters 93 | int num_procs; // num procs used for threads 94 | int numa_nodes; // num numa nodes to init 95 | int simd_mode; // -1: best auto, 0:off(fpu) 1:sse 2:avx 96 | 97 | // checks 98 | bool test_nan_logits; // test for NAN at sampling in all logits results 99 | 100 | // run mode 101 | enum e_run_mode run_mode; // 0: generate, 1:chat 102 | int gen_run_steps; // number of steps to run. 
0 = max (model max_seq_len) 103 | char *token_eos_str; // end of string token (assistant reply end) 104 | char *token_eot_str; // end of text token (dialog/generate end) 105 | 106 | // token display option 107 | bool tok_disp_raw; 108 | bool tok_disp_split; // separate each token with ',' 109 | bool tok_disp_prob; // display sampling information 110 | 111 | // generate mode config 112 | char *gen_mode_prompt; // init prompt for generate run_mode 113 | 114 | // chat mode config 115 | struct chat_cfg_t chat; 116 | 117 | // defined using strings 118 | enum e_model_id e_model_id; 119 | int token_eos; // eos token 120 | int token_eot; // eot token 121 | }; 122 | 123 | struct model_t 124 | { 125 | struct run_conf_t config; 126 | struct tokenizer_t tokenizer; 127 | struct transformer_t transformer; 128 | struct sampler_t sampler; 129 | }; 130 | 131 | extern struct model_t model; 132 | 133 | void build_model(const char *conf_file_name); 134 | 135 | void free_model(void); 136 | 137 | // chat loop 138 | void chat(void); 139 | 140 | // generation loop 141 | void generate(void); 142 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_qwen2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_qwen2.5.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/qwen/qwen2.5-7B-Instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/qwen/qwen2.5-7B-Instruct/config.json 7 | torch float type: bf16 8 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 9 | processor(s) core(s) used: 12 in 1 node(s). 
10 | load: E:/qwen/qwen2.5-7B-Instruct/model-00001-of-00004.safetensors 11 | load: E:/qwen/qwen2.5-7B-Instruct/model-00002-of-00004.safetensors 12 | load: E:/qwen/qwen2.5-7B-Instruct/model-00003-of-00004.safetensors 13 | load: E:/qwen/qwen2.5-7B-Instruct/model-00004-of-00004.safetensors 14 | warning: tokenizer/transformer vocab_size missmatch (151665/152064) 15 | sampler config: 16 | temperature : 0.60 17 | topp : 0.65 18 | topk : 25 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.00 22 | repeat_penalty_n : 0 23 | eos_amp : 0.05 24 | eos_amp_n : 300 25 | rand seed : 1234 26 | Generate: max 32768 tokens.. 27 | - Press 'esc' key to break generation. 28 | The explanation for the existence of seasons is that the Earth's axis is tilted 29 | at an angle of 23.5 degrees relative to its orbital plane. This tilt causes diff 30 | erent parts of the Earth to receive varying amounts of sunlight throughout the y 31 | ear, leading to the changing seasons. However, the Earth's axis is not fixed in 32 | space, but rather it precesses, meaning it traces out a circle in the sky over a 33 | period of about 26,000 years. This precession affects the direction in which th 34 | e Earth's axis points, and consequently, the position of the solstices and equin 35 | oxes relative to the stars. 36 | 37 | Given that the Earth's axis is tilted at 23.5 degrees, and it precesses over a p 38 | eriod of 26,000 years, calculate the average rate of precession in degrees per y 39 | ear. Additionally, determine the change in the position of the solstices relativ 40 | e to the stars over a period of 13,000 years. To determine the average rate of p 41 | recession in degrees per year, we start with the total precession period and the 42 | total angle of precession. The Earth's axis precesses in a circle over a period 43 | of 26,000 years, and the angle of precession is 360 degrees (since it completes 44 | one full circle). 
45 | 46 | The average rate of precession in degrees per year is given by: 47 | \[ 48 | \text{Average rate of precession} = \frac{360 \text{ degrees}}{26,000 \text{ yea 49 | rs}} = \frac{360}{26,000} \approx 0.013846 \text{ degrees per year} 50 | \] 51 | 52 | Next, we need to determine the change in the position of the solstices relative 53 | to the stars over a period of 13,000 years. Since the Earth's axis precesses at 54 | a rate of approximately 0.013846 degrees per year, the change in the position of 55 | the solstices over 13,000 years is: 56 | \[ 57 | \text{Change in position} = 0.013846 \text{ degrees per year} \times 13,000 \tex 58 | t{ years} = 180.00 \text{ degrees} 59 | \] 60 | 61 | Therefore, the change in the position of the solstices relative to the stars ove 62 | r a period of 13,000 years is: 63 | \[ 64 | \boxed{180 \text{ degrees}} 65 | \] 66 | total time: 114.35s for 543 tokens, tok/s: 4.75 67 | Press any key to continue . . . 68 | 69 | 70 | 71 | 72 | read file run_json/run_qwen2.5.json 73 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 74 | conv/matmul AVX2 checks done. 75 | load tokenizer: E:/qwen/qwen2.5-0.5B/tokenizer.json 76 | load transformer.. 77 | read model config in: E:/qwen/qwen2.5-0.5B/config.json 78 | torch float type: bf16 79 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 80 | processor(s) core(s) used: 12 in 1 node(s). 81 | load: E:/qwen/qwen2.5-0.5B/model.safetensors 82 | info: classifier use embed_tokens.weight. 83 | warning: tokenizer/transformer vocab_size missmatch (151665/151936) 84 | sampler config: 85 | temperature : 0.60 86 | topp : 0.65 87 | topk : 25 88 | topp_minp : 0.05 89 | topp_eos : true 90 | repeat_penalty : 0.00 91 | repeat_penalty_n : 0 92 | eos_amp : 0.05 93 | eos_amp_n : 300 94 | rand seed : 1234 95 | Generate: max 32768 tokens.. 96 | - Press 'esc' key to break generation. 
97 | The explanation for the existence of seasons is that the Earth's axis is tilted 98 | at an angle of approximately 23.5 degrees relative to its orbit around the Sun. 99 | This tilt causes the same amount of sunlight to fall on the northern and souther 100 | n hemispheres at different times of the year. This is why the seasons are observ 101 | ed. 102 | total time: 1.22s for 67 tokens, tok/s: 55.05 103 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/2_nodes/res_2sockets.txt: -------------------------------------------------------------------------------- 1 | LLama2 7B 2 | C:\dev_c\llama_st>llama_stw run_json/run_llama2.json 3 | read model config in: D:/llama2/llama2-7b-chat-hf/config.json 4 | 5 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 6 | 7 | processor(s) used: 24 in 2 node(s). 8 | total time: 28.00s for 249 tokens, tok/s: 8.89 9 | 10 | processor(s) used: 22 in 2 node(s). 11 | total time: 27.49s for 249 tokens, tok/s: 9.06 12 | 13 | processor(s) used: 20 in 2 node(s). 14 | total time: 27.92s for 249 tokens, tok/s: 8.92 15 | 16 | processor(s) used: 18 in 2 node(s). 17 | total time: 28.55s for 249 tokens, tok/s: 8.72 18 | 19 | processor(s) used: 16 in 2 node(s). 20 | total time: 29.76s for 249 tokens, tok/s: 8.37 21 | 22 | 23 | model weights converted to float12. 24 | 25 | processor(s) used: 24 in 2 node(s). 26 | total time: 22.87s for 249 tokens, tok/s: 10.89 27 | 28 | processor(s) used: 22 in 2 node(s). 29 | total time: 24.57s for 249 tokens, tok/s: 10.13 30 | 31 | processor(s) used: 20 in 2 node(s). 32 | total time: 26.27s for 249 tokens, tok/s: 9.48 33 | 34 | processor(s) used: 18 in 2 node(s). 35 | total time: 28.89s for 249 tokens, tok/s: 8.62 36 | 37 | processor(s) used: 16 in 2 node(s). 
38 | total time: 32.35s for 249 tokens, tok/s: 7.70 39 | 40 | 41 | // -------------------------------------- 42 | 43 | read model config in: D:/qwen/qwen2.5-7B-Instruct/config.json 44 | 45 | processor(s) used: 24 in 2 node(s). 46 | total time: 63.40s for 543 tokens, tok/s: 8.56 47 | 48 | processor(s) used: 22 in 2 node(s). 49 | total time: 63.74s for 543 tokens, tok/s: 8.52 50 | 51 | processor(s) used: 20 in 2 node(s). 52 | total time: 64.12s for 543 tokens, tok/s: 8.47 53 | 54 | processor(s) used: 18 in 2 node(s). 55 | total time: 66.64s for 543 tokens, tok/s: 8.15 56 | 57 | processor(s) used: 16 in 2 node(s). 58 | total time: 69.56s for 543 tokens, tok/s: 7.81 59 | 60 | f12 61 | 62 | processor(s) used: 24 in 2 node(s). 63 | total time: 52.88s for 543 tokens, tok/s: 10.27 64 | 65 | processor(s) used: 22 in 2 node(s). 66 | total time: 59.48s for 543 tokens, tok/s: 9.13 67 | 68 | processor(s) used: 20 in 2 node(s). 69 | total time: 65.05s for 543 tokens, tok/s: 8.35 70 | 71 | 72 | 73 | load tokenizer: D:/qwen/qwen2.5-72b-intruct/tokenizer.json 74 | model weights converted to float12. 75 | 76 | processor(s) used: 24 in 2 node(s). 77 | total time: 924.74s for 480 tokens, tok/s: 0.52 78 | 79 | 80 | 81 | 82 | load tokenizer: D:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 83 | torch float type: bf16 84 | 85 | processor(s) core(s) used: 12 in 1 node(s). (HT off) 86 | total time: 174.74s for 282 tokens, tok/s: 1.61 87 | 88 | processor(s) core(s) used: 12 in 2 node(s). 89 | total time: 111.21s for 282 tokens, tok/s: 2.54 90 | 91 | processor(s) core(s) used: 24 in 2 node(s). 92 | total time: 136.61s for 282 tokens, tok/s: 2.06 (slown down occured) 93 | 94 | processor(s) core(s) used: 22 in 2 node(s). 95 | total time: 99.81s for 282 tokens, tok/s: 2.83 96 | 97 | processor(s) core(s) used: 20 in 2 node(s). 98 | total time: 108.33s for 282 tokens, tok/s: 2.60 99 | 100 | model weights converted to float12. 101 | 102 | processor(s) core(s) used: 10 in 1 node(s). 
(HT off) 103 | total time: 208.03s for 432 tokens, tok/s: 2.08 104 | 105 | processor(s) core(s) used: 10 in 2 node(s). 106 | total time: 182.43s for 432 tokens, tok/s: 2.37 107 | 108 | processor(s) core(s) used: 24 in 2 node(s). 109 | total time: 181.87s for 432 tokens, tok/s: 2.38 110 | 111 | processor(s) core(s) used: 22 in 2 node(s). 112 | total time: 137.13s for 432 tokens, tok/s: 3.15 113 | 114 | processor(s) core(s) used: 20 in 2 node(s). 115 | total time: 122.16s for 432 tokens, tok/s: 3.54 116 | 117 | processor(s) core(s) used: 18 in 2 node(s). 118 | total time: 144.04s for 432 tokens, tok/s: 3.00 119 | 120 | HT on 121 | 122 | torch float type: bf16 123 | processor(s) core(s) used: 24 in 2 node(s). 124 | total time: 83.26s for 282 tokens, tok/s: 3.39 125 | 126 | processor(s) core(s) used: 22 in 2 node(s). 127 | total time: 98.09s for 282 tokens, tok/s: 2.87 128 | 129 | model weights converted to float12. 130 | processor(s) core(s) used: 24 in 2 node(s). 131 | total time: 134.55s for 432 tokens, tok/s: 3.21 132 | 133 | processor(s) core(s) used: 22 in 2 node(s). 134 | total time: 132.07s for 432 tokens, tok/s: 3.27 135 | 136 | processor(s) core(s) used: 20 in 2 node(s). 137 | total time: 139.39s for 432 tokens, tok/s: 3.10 138 | 139 | 140 | torch float type: bf16 141 | processor(s) used: 24 in 2 node(s). 142 | node 0 procs: 0,2,4,6,8,10,12,14,16,18,20,22, 143 | node 1 procs: 24,26,28,30,32,34,36,38,40,42,44,46, 144 | total time: 114.35s for 282 tokens, tok/s: 2.47 145 | 146 | processor(s) used: 22 in 2 node(s). 147 | total time: 95.71s for 282 tokens, tok/s: 2.95 148 | 149 | model weights converted to float12. 150 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 151 | processor(s) used: 22 in 2 node(s). 
152 | total time: 125.69s for 432 tokens, tok/s: 3.44 153 | 154 | 155 | -------------------------------------------------------------------------------- /tests/2_nodes/llama2_ht_off.txt: -------------------------------------------------------------------------------- 1 | LLama2 7B 2 | C:\dev_c\llama_st>llama_stw run_json/run_llama2.json 3 | read model config in: D:/llama2/llama2-7b-chat-hf/config.json 4 | 5 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 6 | 7 | processor(s) used: 24 in 2 node(s). 8 | total time: 28.00s for 249 tokens, tok/s: 8.89 9 | 10 | processor(s) used: 22 in 2 node(s). 11 | total time: 27.49s for 249 tokens, tok/s: 9.06 12 | 13 | processor(s) used: 20 in 2 node(s). 14 | total time: 27.92s for 249 tokens, tok/s: 8.92 15 | 16 | processor(s) used: 18 in 2 node(s). 17 | total time: 28.55s for 249 tokens, tok/s: 8.72 18 | 19 | processor(s) used: 16 in 2 node(s). 20 | total time: 29.76s for 249 tokens, tok/s: 8.37 21 | 22 | 23 | model weights converted to float12. 24 | 25 | processor(s) used: 24 in 2 node(s). 26 | total time: 22.87s for 249 tokens, tok/s: 10.89 27 | 28 | processor(s) used: 22 in 2 node(s). 29 | total time: 24.57s for 249 tokens, tok/s: 10.13 30 | 31 | processor(s) used: 20 in 2 node(s). 32 | total time: 26.27s for 249 tokens, tok/s: 9.48 33 | 34 | processor(s) used: 18 in 2 node(s). 35 | total time: 28.89s for 249 tokens, tok/s: 8.62 36 | 37 | processor(s) used: 16 in 2 node(s). 38 | total time: 32.35s for 249 tokens, tok/s: 7.70 39 | 40 | 41 | // -------------------------------------- 42 | 43 | read model config in: D:/qwen/qwen2.5-7B-Instruct/config.json 44 | 45 | processor(s) used: 24 in 2 node(s). 46 | total time: 63.40s for 543 tokens, tok/s: 8.56 47 | 48 | processor(s) used: 22 in 2 node(s). 49 | total time: 63.74s for 543 tokens, tok/s: 8.52 50 | 51 | processor(s) used: 20 in 2 node(s). 52 | total time: 64.12s for 543 tokens, tok/s: 8.47 53 | 54 | processor(s) used: 18 in 2 node(s). 
55 | total time: 66.64s for 543 tokens, tok/s: 8.15 56 | 57 | processor(s) used: 16 in 2 node(s). 58 | total time: 69.56s for 543 tokens, tok/s: 7.81 59 | 60 | f12 61 | 62 | processor(s) used: 24 in 2 node(s). 63 | total time: 52.88s for 543 tokens, tok/s: 10.27 64 | 65 | processor(s) used: 22 in 2 node(s). 66 | total time: 59.48s for 543 tokens, tok/s: 9.13 67 | 68 | processor(s) used: 20 in 2 node(s). 69 | total time: 65.05s for 543 tokens, tok/s: 8.35 70 | 71 | C:\dev_c\llama_st> 72 | 73 | 74 | load tokenizer: D:/qwen/qwen2.5-72b-intruct/tokenizer.json 75 | model weights converted to float12. 76 | processor(s) used: 24 in 2 node(s). 77 | total time: 924.74s for 480 tokens, tok/s: 0.52 78 | 79 | 80 | 81 | 82 | load tokenizer: D:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 83 | torch float type: bf16 84 | processor(s) core(s) used: 12 in 1 node(s). (HT off) 85 | total time: 174.74s for 282 tokens, tok/s: 1.61 86 | 87 | processor(s) core(s) used: 12 in 2 node(s). 88 | total time: 111.21s for 282 tokens, tok/s: 2.54 89 | 90 | processor(s) core(s) used: 24 in 2 node(s). 91 | total time: 136.61s for 282 tokens, tok/s: 2.06 92 | 93 | processor(s) core(s) used: 22 in 2 node(s). 94 | total time: 99.81s for 282 tokens, tok/s: 2.83 95 | 96 | processor(s) core(s) used: 20 in 2 node(s). 97 | total time: 108.33s for 282 tokens, tok/s: 2.60 98 | 99 | model weights converted to float12. 100 | 101 | processor(s) core(s) used: 10 in 1 node(s). (HT off) 102 | total time: 208.03s for 432 tokens, tok/s: 2.08 103 | 104 | processor(s) core(s) used: 10 in 2 node(s). 105 | total time: 182.43s for 432 tokens, tok/s: 2.37 106 | 107 | processor(s) core(s) used: 24 in 2 node(s). 108 | total time: 181.87s for 432 tokens, tok/s: 2.38 109 | 110 | processor(s) core(s) used: 22 in 2 node(s). 111 | total time: 137.13s for 432 tokens, tok/s: 3.15 112 | 113 | processor(s) core(s) used: 20 in 2 node(s). 
114 | total time: 122.16s for 432 tokens, tok/s: 3.54 115 | 116 | processor(s) core(s) used: 18 in 2 node(s). 117 | total time: 144.04s for 432 tokens, tok/s: 3.00 118 | 119 | HT on 120 | 121 | torch float type: bf16 122 | processor(s) core(s) used: 24 in 2 node(s). 123 | total time: 83.26s for 282 tokens, tok/s: 3.39 124 | 125 | processor(s) core(s) used: 22 in 2 node(s). 126 | total time: 98.09s for 282 tokens, tok/s: 2.87 127 | 128 | model weights converted to float12. 129 | processor(s) core(s) used: 24 in 2 node(s). 130 | total time: 134.55s for 432 tokens, tok/s: 3.21 131 | 132 | processor(s) core(s) used: 22 in 2 node(s). 133 | total time: 132.07s for 432 tokens, tok/s: 3.27 134 | 135 | processor(s) core(s) used: 20 in 2 node(s). 136 | total time: 139.39s for 432 tokens, tok/s: 3.10 137 | 138 | 139 | // modif numa 140 | torch float type: bf16 141 | processor(s) used: 24 in 2 node(s). 142 | node 0 procs: 0,2,4,6,8,10,12,14,16,18,20,22, 143 | node 1 procs: 24,26,28,30,32,34,36,38,40,42,44,46, 144 | total time: 114.35s for 282 tokens, tok/s: 2.47 145 | 146 | processor(s) used: 22 in 2 node(s). 147 | total time: 95.71s for 282 tokens, tok/s: 2.95 148 | 149 | model weights converted to float12. 150 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 151 | processor(s) used: 22 in 2 node(s). 
152 | total time: 125.69s for 432 tokens, tok/s: 3.44 153 | 154 | 155 | -------------------------------------------------------------------------------- /src/matmul/matmul_f16.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | #include "matmul_priv.h" 6 | 7 | // ------------------------------------------------------------------ 8 | // conversion f16 => f32 9 | // ------------------------------------------------------------------ 10 | 11 | static float *lut_f16_to_f32 = NULL; 12 | 13 | // must be used to create conversion lut only 14 | static void cvt_f16_to_f32_fpu(float *f32, const f16_t *f16, size_t ne) 15 | { 16 | size_t i; 17 | for (i=0; i!=ne; i++) 18 | f32[i] = lut_f16_to_f32[f16[i]]; 19 | } 20 | 21 | static void cvt_f16_to_f32_sse(float *f32, const f16_t *f16, size_t ne) 22 | { 23 | size_t i; 24 | for (i=0; i!=ne; i+=4) 25 | _mm_store_ps(f32 + i, _mm_cvtph_ps(_mm_loadl_epi64((__m128i *)(f16 + i)))); 26 | } 27 | 28 | static void cvt_f16_to_f32_avx1(float *f32, const f16_t *f16, size_t ne) 29 | { 30 | size_t i; 31 | for (i=0; i!=ne; i+=8) 32 | _mm256_store_ps(f32 + i, _mm256_cvtph_ps(_mm_load_si128((__m128i *)(f16 + i)))); 33 | } 34 | 35 | const cvt_f16_to_f32_t cvt_f16_to_f32_procs[simd_n] = 36 | { 37 | cvt_f16_to_f32_fpu, 38 | cvt_f16_to_f32_sse, 39 | cvt_f16_to_f32_avx1, 40 | NULL, 41 | }; 42 | 43 | // ------------------------------------------------------------------ 44 | // matmul f32 * f16 => f32 45 | // ------------------------------------------------------------------ 46 | 47 | // is very slow, usable for very small models 48 | static void matmul_f32_f16_fpu(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 49 | { 50 | const f16_t *m, *m_end = mat + y_mat * len_vec; 51 | for (m=mat; m!=m_end; m+=len_vec) 52 | { 53 | float acc = 0; 54 | int i; 55 | for (i=0; i!=len_vec; i++) 56 | acc += vec[i] * 
lut_f16_to_f32[m[i]]; 57 | *res++ = acc; 58 | } 59 | } 60 | 61 | static void matmul_f32_f16_sse(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 62 | { // res[y] = dot(vec, mat row y); requires len_vec multiple of 4 (loop exit uses !=) and 16-byte aligned vec (_mm_load_ps) 63 | const f16_t *m, *m_end = mat + y_mat * len_vec; 64 | for (m=mat; m!=m_end; m+=len_vec) 65 | { 66 | __m128 acc = _mm_setzero_ps(); // accumulator starts at 0 67 | int i; 68 | for (i=0; i!=len_vec; i+=4) 69 | acc = _mm_fmadd_ps(_mm_cvtph_ps(_mm_loadl_epi64((__m128i *)(m + i))), _mm_load_ps(vec + i), acc); // F16C convert 4 halves + FMA 70 | *res++ = hsum_ps_sse(acc); 71 | } 72 | } 73 | 74 | static void matmul_f32_f16_avx1(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 75 | { // 8 floats per step; requires len_vec multiple of 8, 32-byte aligned vec (_mm256_load_ps) and 16-byte aligned mat rows (_mm_load_si128) 76 | const f16_t *m, *m_end = mat + y_mat * len_vec; 77 | for (m=mat; m!=m_end; m+=len_vec) 78 | { 79 | __m256 acc = _mm256_setzero_ps(); 80 | int i; 81 | for (i=0; i!=len_vec; i+=8) 82 | acc = _mm256_fmadd_ps(_mm256_cvtph_ps(_mm_load_si128((__m128i *)(m + i))), _mm256_load_ps(vec + i), acc); 83 | *res++ = hsum_ps_avx1(acc); 84 | } 85 | } 86 | 87 | // dispatch table indexed by simd mode: fpu, sse, avx1 (last entry NULL — presumably the avx2 slot; confirm against matmul.h) 88 | const matmul_f32_f16_t matmul_f32_f16_procs[simd_n] = 89 | { 90 | matmul_f32_f16_fpu, 91 | matmul_f32_f16_sse, 92 | matmul_f32_f16_avx1, 93 | NULL, 94 | }; 95 | 96 | // ------------------------------------------------------------------ 97 | // F16 conversions 98 | // ------------------------------------------------------------------ 99 | 100 | #include "l_util.h" 101 | #include "mem_alloc.h" 102 | 103 | // -------------------------------------------------------- 104 | // software conversion f16 to f32 if no CPU support of F16C 105 | // (opterons 62xx, xeon E55xx, x56xx, xeon E5 v1, ..) 106 | // used for data conversion to sf16 only.
107 | 108 | // software convert if no F16C support 109 | static f16_t sw_cvt_f32_to_f16(float f32) 110 | { 111 | const uint32_t b = (*(uint32_t*)&f32) + 0x00001000; 112 | const uint32_t e = (b & 0x7F800000) >> 23; 113 | uint32_t r = (b & 0x80000000) >> 16; 114 | if (e > 101) 115 | { 116 | const uint32_t m = b & 0x007FFFFF; 117 | if (e < 113) r |= (((0x007FF000 + m) >> (125-e)) + 1) >> 1; 118 | else 119 | { 120 | r |= (((e - 112) << 10) & 0x7C00) | m >> 13; 121 | if (e > 143) r |= 0x7FFF; 122 | } 123 | } 124 | return (f16_t)r; 125 | } 126 | 127 | // convert buffer f32 to f16 128 | void cvt_f32_to_f16(f16_t *f16, const float *f32, size_t ne) 129 | { 130 | size_t i; 131 | if (matmul_procs.cpu_f16c) 132 | { 133 | for (i=0; i!=ne; i+=4) 134 | { 135 | __m128i h4 = _mm_cvtps_ph(_mm_loadu_ps(f32 + i), _MM_FROUND_TO_NEAREST_INT); // convert to 4 float 16 136 | _mm_storel_epi64((__m128i *)(f16 + i), h4); 137 | } 138 | } 139 | else 140 | { 141 | for (i=0; i!=ne; i++) 142 | f16[i] = sw_cvt_f32_to_f16(f32[i]); 143 | } 144 | } 145 | 146 | static float sw_cvt_f16_to_f32(f16_t f16) 147 | { 148 | const uint32_t e = (f16 & 0x7C00) >> 10; // exponent 149 | const uint32_t m = (f16 & 0x03FF) << 13; // mantissa 150 | uint32_t r = (f16 & 0x8000) << 16; 151 | 152 | if (e) r |= ((e + 112) << 23 | m); 153 | else if (m) 154 | { 155 | const float f = (float)m; 156 | const uint32_t v = (*(uint32_t*)&f)>>23; 157 | r |= ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)); 158 | } 159 | return *(float *)&r; 160 | } 161 | 162 | void init_sw_f16c(void) 163 | { 164 | int i; 165 | lut_f16_to_f32 = malloc_check(N_64K*sizeof(float)); 166 | for (i=0; i 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | 6 | // ------------------------------------------------------------------ 7 | // conversion bf16 => f32 8 | // ------------------------------------------------------------------ 9 | 10 | static void cvt_bf16_to_f32_fpu(float *f32, const bf16_t *bf16, size_t ne) 11 | { 12 | int 
*ps = (int *)f32; 13 | size_t i; 14 | for (i=0; i f32 63 | // ------------------------------------------------------------------ 64 | 65 | static void matmul_f32_bf16_fpu(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 66 | { 67 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 68 | for (m=mat; m!=m_end; m+=len_vec) 69 | { 70 | float acc = 0; 71 | int i; 72 | for (i=0; i!=len_vec; i++) 73 | { 74 | unsigned int _f = m[i] << 16; 75 | acc += vec[i] * *(float *)&_f; 76 | } 77 | *res++ = acc; 78 | } 79 | } 80 | 81 | static void matmul_f32_bf16_sse(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 82 | { 83 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 84 | for (m=mat; m!=m_end; m+=len_vec) 85 | { 86 | __m128 acc0 = _mm_setzero_ps(); 87 | __m128 acc1 = _mm_setzero_ps(); 88 | __m128 acc2 = _mm_setzero_ps(); 89 | __m128 acc3 = _mm_setzero_ps(); 90 | int i; 91 | for (i=0; i!=len_vec; i+=16) 92 | { 93 | __m128i d0 = _mm_load_si128((__m128i *)(m + i)); 94 | __m128 ps_l0 = _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), d0)); 95 | __m128 ps_h0 = _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), d0)); 96 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 97 | __m128 ps_l1 = _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), d1)); 98 | __m128 ps_h1 = _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), d1)); 99 | acc0 = _mm_fmadd_ps(ps_l0, _mm_load_ps(vec + i ), acc0); 100 | acc1 = _mm_fmadd_ps(ps_h0, _mm_load_ps(vec + i + 4 ), acc1); 101 | acc2 = _mm_fmadd_ps(ps_l1, _mm_load_ps(vec + i + 8 ), acc2); 102 | acc3 = _mm_fmadd_ps(ps_h1, _mm_load_ps(vec + i + 12), acc3); 103 | } 104 | *res++ = hsum_ps_sse_4x(acc0,acc1,acc2,acc3); 105 | } 106 | } 107 | 108 | static void matmul_f32_bf16_avx1(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 109 | { 110 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 111 | for (m=mat; m!=m_end; m+=len_vec) 112 | { 113 | __m256 acc0 = 
_mm256_setzero_ps(); 114 | __m256 acc1 = _mm256_setzero_ps(); 115 | 116 | int i; 117 | for (i=0; i!=len_vec; i+=16) 118 | { 119 | __m128i d0 = _mm_load_si128((__m128i *)(m + i )); 120 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 121 | acc0 = _mm256_fmadd_ps(GET_8BF16_AVX1(d0), _mm256_load_ps(vec + i ), acc0); 122 | acc1 = _mm256_fmadd_ps(GET_8BF16_AVX1(d1), _mm256_load_ps(vec + i + 8), acc1); 123 | } 124 | *res++ = hsum_ps_avx_2x(acc0, acc1); 125 | } 126 | } 127 | 128 | static void matmul_f32_bf16_avx2(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 129 | { 130 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 131 | for (m=mat; m!=m_end; m+=len_vec) 132 | { 133 | __m256 acc0 = _mm256_setzero_ps(); 134 | __m256 acc1 = _mm256_setzero_ps(); 135 | 136 | int i; 137 | for (i=0; i!=len_vec; i+=16) 138 | { 139 | __m128i d0 = _mm_load_si128((__m128i *)(m + i )); 140 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 141 | acc0 = _mm256_fmadd_ps(GET_8BF16_AVX2(d0), _mm256_load_ps(vec + i ), acc0); 142 | acc1 = _mm256_fmadd_ps(GET_8BF16_AVX2(d1), _mm256_load_ps(vec + i + 8), acc1); 143 | } 144 | *res++ = hsum_ps_avx_2x(acc0, acc1); 145 | } 146 | } 147 | 148 | // init functions list 149 | const matmul_f32_bf16_t matmul_f32_bf16_procs[simd_n] = 150 | { 151 | matmul_f32_bf16_fpu, 152 | matmul_f32_bf16_sse, 153 | matmul_f32_bf16_avx1, 154 | matmul_f32_bf16_avx2 155 | }; 156 | -------------------------------------------------------------------------------- /src/matmul/tr_opt_simd.c: -------------------------------------------------------------------------------- 1 | // simd optimized head attention for transformer 2 | 3 | #ifdef USE_SA_SIMD 4 | 5 | #include 6 | #include 7 | #include 8 | #include "mm_hsum.h" 9 | #include "transformer.h" 10 | #include "matmul.h" 11 | #include "tr_opt_simd.h" 12 | 13 | static void head_att_opt_fpu(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t 
*p) 14 | { 15 | float att_max = -1e10; // softmax max att value 16 | float att_e_sum = 0; // softmax exp diff sum 17 | int kv_dim = p->kv_dim; 18 | int head_size = p->head_size; 19 | float sqrt_head_size = p->sqrt_head_size; 20 | 21 | int t; 22 | for (t=0; t att_max) 28 | att_max = *a; // softmax max att value 29 | } 30 | 31 | // softmax the scores to get attention weights, from 0..pos inclusively 32 | for (t=0; t 0, accumulate xb 48 | } 49 | } 50 | 51 | // sse 52 | static void head_att_opt_sse(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t *p) 53 | { 54 | float att_max = -1e10; // softmax max att value 55 | float att_e_sum = 0; // softmax exp diff sum 56 | int kv_dim = p->kv_dim; 57 | int head_size = p->head_size; 58 | int t; 59 | float sqrt_head_size = p->sqrt_head_size; 60 | 61 | const float *m, *m_end = k + n_tok * kv_dim; 62 | float *a = att; 63 | for (m=k; m!=m_end; m+=kv_dim) 64 | { 65 | __m128 acc = _mm_setzero_ps(); 66 | float r; 67 | int i; 68 | for (i=0; i!=head_size; i+=4) 69 | acc = _mm_fmadd_ps(_mm_load_ps(q + i), _mm_load_ps(m + i), acc); 70 | r = hsum_ps_sse(acc); 71 | *a++ = r; 72 | if (r > att_max) 73 | att_max = r; 74 | } 75 | 76 | // softmax the scores to get attention weights, from 0..pos inclusively 77 | for (t=0; tkv_dim; 110 | int head_size = p->head_size; 111 | int t; 112 | float sqrt_head_size = p->sqrt_head_size; 113 | 114 | const float *m, *m_end = k + n_tok * kv_dim; 115 | float *a = att; 116 | for (m=k; m!=m_end; m+=kv_dim) 117 | { 118 | __m256 acc = _mm256_setzero_ps(); 119 | float r; 120 | int i; 121 | for (i=0; i!=head_size; i+=8) 122 | acc = _mm256_fmadd_ps(_mm256_load_ps(q + i), _mm256_load_ps(m + i), acc); 123 | r = hsum_ps_avx1(acc); 124 | *a++ = r; 125 | if (r > att_max) 126 | att_max = r; 127 | } 128 | 129 | // softmax the scores to get attention weights, from 0..pos inclusively 130 | for (t=0; t= simd_avx1) 162 | head_att_opt = head_att_opt_avx; 163 | else 
164 | if (simd_typ == simd_sse) 165 | head_att_opt = head_att_opt_sse; 166 | else 167 | head_att_opt = head_att_opt_fpu; 168 | } 169 | 170 | #endif // USE_SA_SIMD 171 | -------------------------------------------------------------------------------- /run_json/run_llama1.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama1", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 2, // count of .safetensors files in model 12 | "model_path": "D:/llama1_st/7b", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.75, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 2, // mode = 2 required for llama1 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Llama:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // init prompt mode 2 (generate mode) 95 | // for mode 2 correct work: 96 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 97 | // - no space at end of cm2_user_name_sw 98 | // - terminate sys prompt to user name 99 | 100 | // templates for mode 2 101 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 102 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 103 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 104 | 105 | // llama1 generate type chat example 106 | "cm2_sys_prompt": 107 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 108 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 109 | +"User: Hello Bob.\n" 110 | +"Bob: Hello. How may I help you today?\n" 111 | +"User:", 112 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
113 | } 114 | -------------------------------------------------------------------------------- /src/utils/numa_w.c: -------------------------------------------------------------------------------- 1 | // numa infos for windows 2 | 3 | #include 4 | #include "l_util.h" // msg_error 5 | #include "mem_alloc.h" 6 | #include "numa.h" 7 | 8 | // -------------------------------------- 9 | // get some processors/numa configuration 10 | // note: processor group not managed, will return only processors in current group (max 64) 11 | 12 | struct numa_inf_t numa = { 0 }; 13 | 14 | void init_numa_info(void) 15 | { 16 | DWORD sz = 0; 17 | uint64_t p_msk = 0; // physical processors mask 18 | int i, j, lc = 0; // not physical processors count (ht) 19 | unsigned char node_plist[MAX_NUMA_NODES][MAX_NUMA_PROCS]; 20 | 21 | if (!GetLogicalProcessorInformation(NULL, &sz)) 22 | { 23 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) 24 | { 25 | SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pi, *pi_buff; 26 | pi_buff = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION *)malloc_check(sz); 27 | if (GetLogicalProcessorInformation(pi_buff, &sz)) 28 | { 29 | char *pi_end = (char *)pi_buff + sz; 30 | 31 | // get one physical processor if HT enabled, define p_msk 32 | for (pi = pi_buff; (char *)pi < pi_end; pi++) 33 | { 34 | if (pi->Relationship == RelationProcessorCore) 35 | { 36 | // get one processor in mask 37 | uint64_t m = pi->ProcessorMask; 38 | uint64_t m_p = 1; // physical mask 39 | if (!m) // something is wrong 40 | break; 41 | while (!(m & 1)) 42 | { m >>= 1; m_p <<= 1; } 43 | 44 | // count remaining as HT processor 45 | m >>= 1; // pass first found 46 | while (m) 47 | { lc += m & 1; m >>= 1; } 48 | 49 | if (p_msk & m_p) // already defined ? 
50 | break; 51 | p_msk |= m_p; // update global mask for physicals 52 | numa.n_procs++; // cores count 53 | } 54 | } 55 | if ((!p_msk) || ((char *)pi != pi_end)) // a break occured 56 | msg_error("init_numa_info failed (1)"); 57 | 58 | // get processors nodes 59 | for (pi = pi_buff; (char *)pi < pi_end; pi++) 60 | { 61 | if (pi->Relationship == RelationNumaNode) 62 | { 63 | uint64_t m = pi->ProcessorMask; 64 | uint64_t m_p = 1; 65 | int p_id = 0; // proc id 66 | int n_id = pi->NumaNode.NodeNumber; // node id 67 | if (n_id >= MAX_NUMA_NODES) 68 | break; 69 | if (n_id >= numa.n_nodes) 70 | numa.n_nodes = n_id + 1; 71 | while (m) 72 | { 73 | if (m & 1) 74 | { 75 | numa.proc_node[p_id] = n_id; 76 | if (p_msk & m_p) // ignore if HT proc 77 | node_plist[n_id][numa.node_nprocs[n_id]++] = p_id; 78 | } 79 | m >>= 1; 80 | m_p <<= 1; 81 | p_id++; 82 | } 83 | } 84 | } 85 | } 86 | free_check(pi_buff); 87 | } 88 | } 89 | 90 | // main thread 91 | numa.mt_node = numa.proc_node[numa_get_thread_proc()]; 92 | numa.mt_procs = numa.node_nprocs[numa.mt_node]; 93 | 94 | if (!numa.mt_procs) // something is wrong 95 | msg_error("init_numa_info failed (2)"); 96 | 97 | // create sorted procs list for user, with procs for main thread at begin 98 | memcpy(numa.proc_list, node_plist[numa.mt_node], numa.mt_procs); 99 | memset(numa.proc_node, numa.mt_node, numa.mt_procs); 100 | j = numa.mt_procs; 101 | for (i=0; i0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.6, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.90, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 30, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= 
EOS 31 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " bool is_prime(int x)\n{", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Llcode:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "You are a chatbot who can help code.", 97 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/blog/llama2#how-to-prompt-llama-2 103 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "%s [/INST]", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "You are a chatbot who can help code.", 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. 
How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_mathstral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mathstral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 6, // count of .safetensors files 12 | "model_path": "E:/mathstral/Mathstral-7B-v0.1", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Mathstral:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // todo: find prompt documentation. 
103 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_mistral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mistral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 3, // count of .safetensors files 12 | "model_path": "E:/mistral/mistral-7b-instruct-v0.3", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Mistral:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // todo: find prompt documentation. 
103 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_tinyllama.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "tinyllama", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 1, // count of .safetensors files in model 12 | "model_path": "D:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.6, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.65, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 (cannot with tinyllama) 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Tiny:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "You are a chatbot who can help code.", 97 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 103 | "cm1_sys_template": "<|system|>\n%s\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "<|user|>\n%s\n<|assistant|>\n", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "You are a chatbot who can help code.", 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nTiny:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Tiny. " 126 | +"Tiny is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Tiny.\n" 128 | +"Tiny: Hello. 
How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_zephyr.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "zephyr", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 8, // count of .safetensors files 12 | "model_path": "E:/zephyr/zephyr-7b-beta", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Zephyr:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 
97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha 103 | "cm1_sys_template": "<|system|>\n%s\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "<|user|>\n%s\n<|assistant|>\n", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
131 | } 132 | -------------------------------------------------------------------------------- /src/matmul/matmul_sf16.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | #include "matmul_priv.h" 6 | 7 | // -------------------------------- 8 | // float SF16 conversions 9 | 10 | #define SF16_CVT_MSK 0xfffc7fff 11 | #define SF16_CVT_LSL 13 12 | #define SF16_CVT_ADD 0x18800 // sf16 range 1.8626451e-009 to 7.9960938 13 | 14 | #define SF16_CVT_MAX 8.0f // max +/- converted value 15 | #define SF16_ERR_MAX 0.0039062f // max convert error for SF16_CVT_MAX value 16 | 17 | #define F16_8_00 18432 // 8.00 in float 16 18 | #define F16_TO_SF16_MAX F16_8_00 // max F16 value that can be converted to SF16 19 | 20 | // ------------------------------------------------------------------ 21 | // conversion sf16 => f32 22 | // ------------------------------------------------------------------ 23 | 24 | static void cvt_sf16_to_f32_fpu(float *f32, const sf16_t *sf16, size_t ne) 25 | { 26 | int *ps = (int *)f32; 27 | size_t i; 28 | for (i=0; i f32 64 | // ------------------------------------------------------------------ 65 | 66 | static void matmul_f32_sf16_fpu(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 67 | { 68 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 69 | for (m=mat; m!=m_end; m+=len_vec) 70 | { 71 | float acc = 0; 72 | int i; 73 | for (i=0; i!=len_vec; i++) 74 | { 75 | unsigned int f32i = (((short)m[i] & SF16_CVT_MSK) + SF16_CVT_ADD) << SF16_CVT_LSL; 76 | acc += vec[i] * *(float *)&f32i; 77 | } 78 | *res++ = acc; 79 | } 80 | } 81 | 82 | static void matmul_f32_sf16_sse(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 83 | { 84 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 85 | for (m=mat; m!=m_end; m+=len_vec) 86 | { 87 | __m128 acc0 = _mm_setzero_ps(); 88 | __m128 acc1 = _mm_setzero_ps(); 89 | 
__m128 acc2 = _mm_setzero_ps(); 90 | __m128 acc3 = _mm_setzero_ps(); 91 | int i; 92 | for (i=0; i!=len_vec; i+=16) 93 | { 94 | acc0 = _mm_fmadd_ps(_mm_load_ps(vec + i ), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i ))), acc0); 95 | acc1 = _mm_fmadd_ps(_mm_load_ps(vec + i + 4), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 4))), acc1); 96 | acc2 = _mm_fmadd_ps(_mm_load_ps(vec + i + 8), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 8))), acc2); 97 | acc3 = _mm_fmadd_ps(_mm_load_ps(vec + i + 12), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 12))), acc3); 98 | } 99 | *res++ = hsum_ps_sse_4x(acc0,acc1,acc2,acc3); 100 | } 101 | } 102 | 103 | static void matmul_f32_sf16_avx2(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 104 | { 105 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 106 | for (m=mat; m!=m_end; m+=len_vec) 107 | { 108 | __m256 acc = _mm256_setzero_ps(); 109 | int i; 110 | for (i=0; i!=len_vec; i+=8) 111 | acc = _mm256_fmadd_ps(_mm256_load_ps(vec + i), CVT_8SF16(_mm_load_si128((__m128i *)(m + i))), acc); 112 | *res++ = hsum_ps_avx1(acc); 113 | } 114 | } 115 | 116 | // init functions list 117 | const matmul_f32_sf16_t matmul_f32_sf16_procs[simd_n] = 118 | { 119 | matmul_f32_sf16_fpu, 120 | matmul_f32_sf16_sse, 121 | NULL, 122 | matmul_f32_sf16_avx2, 123 | }; 124 | 125 | // ------------------------------------------------------------------ 126 | // SF16 conversions 127 | // ------------------------------------------------------------------ 128 | 129 | #include "l_util.h" 130 | #include "mem_alloc.h" 131 | 132 | // lut to convert model weights 133 | static sf16_t lut_f16_to_sf16[N_64K] = { 0 }; 134 | 135 | // f32 to sf16 (using e_ofs = 98) 136 | static int f32_to_sf16(float f32) 137 | { 138 | int a = *(int *)&f32; 139 | int e = (a >> 23) & 0xff; 140 | int m = (a >> (23 - 10)) & ((1 << 10) - 1); 141 | int f16 = m + ((e - 98) << 10); 142 | return f16; 143 | } 144 | 145 | // init lookup table. 
146 | void init_conv_sf16(void) 147 | { 148 | // alloc temporary AVX aligned arrays 149 | VAR_ALLOC(f16_list, f16_t, N_64K/2); 150 | VAR_ALLOC(f16_to_f32, float, N_64K/2); 151 | int i; 152 | 153 | for (i=0; i= (N_64K/2)) k = (N_64K/2)-1; 166 | 167 | // note: there is no rounding required, except for F16 0.0 replaced 168 | // by +/-1.8626451e-009, all other values < SF16_TO_F16_MAX match exactly. 169 | lut_f16_to_sf16[i] = k; 170 | lut_f16_to_sf16[i+(N_64K/2)] = 0x8000 | k; 171 | } 172 | free_check(f16_to_f32); 173 | free_check(f16_list); 174 | } 175 | 176 | // convert buffer f16 to sf16 177 | void cvt_f16_to_sf16(sf16_t *sf16, const f16_t *f16, size_t ne) 178 | { 179 | const f16_t *f16_end = f16 + ne; 180 | while (f16 < f16_end) 181 | { 182 | f16_t _a = *f16++; 183 | if (ABS_F16(_a) > F16_TO_SF16_MAX) // check _a can be converted with small error 184 | msg_error("conversion F16 to SF16 out of range"); 185 | *sf16++ = lut_f16_to_sf16[_a]; 186 | } 187 | } -------------------------------------------------------------------------------- /run_json/run_llama2.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama2", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 2, // count of .safetensors files in model 12 | "model_path": "E:/llama2/llama2-7b-chat-hf", // path to .safetensors, config.json 13 | 14 | /* 15 | // test llama2 8B pro 16 | "model_num_safetensors": 2, 17 | "model_path": "E:/llama2_pro/8b_pro_instruct", 18 | */ 19 | 20 | // name of tokenizer 21 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 22 | 23 | // ------------------------------------ 24 | // transformer parameters 25 | 26 | // rope value 27 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 28 | 29 | // ------------------------------------ 30 | // sampler parameters 31 | 32 | "temperature": 0.9, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 33 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 34 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 35 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 36 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 37 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 38 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 39 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 40 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 41 | "rand_seed": 1234, // (integer) random seed 42 | 43 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 44 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 45 | 46 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 47 | 48 | // ------------------------------------ 49 | 50 | // model load data conversion 51 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 52 | "cvt_f12": false, // convert model to float12 53 | "cvt_f8": false, // convert model to float8 54 | 55 | // hardware parameters 56 | "num_procs": -1, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 57 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 58 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 59 | 60 | // run parameters 61 | "run_mode": 0, // 0: generate, 1:chat 62 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 63 | "token_eos_str": "", // end of string token (assistant reply end) 64 | "token_eot_str": "", // end of text token (dialog/generate end) 65 | 66 | // tokens display options in chat or generate mode 67 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
68 | "tok_disp_split": false, // true: display tokens separated with ',' 69 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 70 | 71 | // ------------------------------------ 72 | // generate mode prompt init 73 | 74 | "gen_mode_prompt": " The explanation for the existence of seasons is", 75 | 76 | // ------------------------------------ 77 | // chat mode config 78 | 79 | // dialog colors (r.g.b format) 80 | "chat_use_colors": true, // use colors for chat 81 | "chat_col_msg": "250.250.250", // messages text color 82 | "chat_col_user": "180.255.180", // user text color (keyboard input) 83 | "chat_col_assistant": "180.180.255", // assistant answer text color 84 | 85 | // forward: define what is displayed when forward user prompt 86 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 87 | 88 | // ------------------------------------ 89 | // promp mode: define the method to generate the prompt format 90 | // 0: use model_ident value to select templates defined in chat.c 91 | // 1: user defined templates cm1_xxx.. 92 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
93 | "chat_prompt_mode": 0, 94 | 95 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 96 | "chat_assistant_name": "Llama2:", 97 | "chat_user_name": "User:", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=0 parameters 101 | 102 | "cm0_sys_prompt": "You are a chatbot who can help code.", 103 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 104 | 105 | // ------------------------------------ 106 | // chat_prompt_mode=1 parameters (user defined template) 107 | 108 | // https://huggingface.co/blog/llama2#how-to-prompt-llama-2 109 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", // %s replace cm1_sys_prompt 110 | "cm1_user_first_template": "%s [/INST]", // first user template following sys prompt 111 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 112 | "cm1_end_template": "\n", // end of assistant reply template 113 | 114 | "cm1_sys_prompt": "You are a chatbot who can help code.", 115 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 116 | 117 | // ------------------------------------ 118 | // init prompt mode 2 (generate mode) 119 | // for mode 2 correct work: 120 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 121 | // - no space at end of cm2_user_name_sw 122 | // - terminate sys prompt to user name 123 | 124 | // templates for mode 2 125 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 126 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 127 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 128 | 129 | // llama1 generate type chat example 130 | "cm2_sys_prompt": 131 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 132 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 133 | +"User: Hello Bob.\n" 134 | +"Bob: Hello. 
How may I help you today?\n" 135 | +"User:", 136 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 137 | } 138 | -------------------------------------------------------------------------------- /run_json/run_mixtral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mixtral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 19, // count of .safetensors files 12 | // "model_path": "E:/mixtral/Mixtral-8x7B-Instruct-v0.1", // path to .safetensors, config.json 13 | "model_path": "D:/mixtral/Mixtral-8x7B-Instruct-v0.1", // path to .safetensors, config.json 14 | 15 | // name of tokenizer 16 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 17 | 18 | // ------------------------------------ 19 | // transformer parameters 20 | 21 | // rope value 22 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 23 | 24 | // ------------------------------------ 25 | // sampler parameters 26 | 27 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 28 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 29 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 30 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 31 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 32 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 33 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 34 | "eos_amp": 0.0, // 0.0 to 2.0 amplify 
eos probability when more than eos_inc_n tokens generated. (0 = disable) 35 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 36 | "rand_seed": 1234, // (integer) random seed 37 | 38 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 39 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 40 | 41 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 42 | 43 | // ------------------------------------ 44 | 45 | // model load data conversion 46 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 47 | "cvt_f12": true, // convert model to float12 48 | "cvt_f8": false, // convert model to float8 (required on 64Gb mem) 49 | 50 | // hardware parameters 51 | "num_procs": 22, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 52 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 53 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 54 | 55 | // run parameters 56 | "run_mode": 0, // 0: generate, 1:chat 57 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 58 | "token_eos_str": "", // end of string token (assistant reply end) 59 | "token_eot_str": "", // end of text token (dialog/generate end) 60 | 61 | // tokens display options in chat or generate mode 62 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
63 | "tok_disp_split": false, // true: display tokens separated with ',' 64 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 65 | 66 | // ------------------------------------ 67 | // generate mode prompt init 68 | 69 | "gen_mode_prompt": " The explanation for the existence of seasons is", 70 | 71 | // ------------------------------------ 72 | // chat mode config 73 | 74 | // dialog colors (r.g.b format) 75 | "chat_use_colors": true, // use colors for chat 76 | "chat_col_msg": "250.250.250", // messages text color 77 | "chat_col_user": "180.255.180", // user text color (keyboard input) 78 | "chat_col_assistant": "180.180.255", // assistant answer text color 79 | 80 | // forward: define what is displayed when forward user prompt 81 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 82 | 83 | // ------------------------------------ 84 | // promp mode: define the method to generate the prompt format 85 | // 0: use model_ident value to select templates defined in chat.c 86 | // 1: user defined templates cm1_xxx.. 87 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 88 | "chat_prompt_mode": 0, 89 | 90 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 91 | "chat_assistant_name": "Mixtral:", 92 | "chat_user_name": "User:", 93 | 94 | // ------------------------------------ 95 | // chat_prompt_mode=0 parameters 96 | 97 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 
98 | "cm0_user_prompt": "What is a pointer in C language ?", 99 | 100 | // ------------------------------------ 101 | // chat_prompt_mode=1 parameters (user defined template) 102 | 103 | // https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format 104 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 105 | "cm1_user_first_template": " [INST] %s [/INST]", // first user template following sys prompt 106 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 107 | "cm1_end_template": "\n", // end of assistant reply template 108 | 109 | "cm1_sys_prompt": "", 110 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 111 | 112 | // ------------------------------------ 113 | // init prompt mode 2 (generate mode) 114 | // for mode 2 correct work: 115 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 116 | // - no space at end of cm2_user_name_sw 117 | // - terminate sys prompt to user name 118 | 119 | // templates for mode 2 120 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 121 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 122 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 123 | 124 | // llama1 generate type chat example 125 | "cm2_sys_prompt": 126 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 127 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 128 | +"User: Hello Bob.\n" 129 | +"Bob: Hello. How may I help you today?\n" 130 | +"User:", 131 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 132 | } 133 | -------------------------------------------------------------------------------- /run_json/run_vigogne2.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "vigogne2", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | /* 12 | "model_num_safetensors": 7, 13 | "model_path": "E:/vigogne/vigogne-33b-instruct", 14 | */ 15 | 16 | "model_num_safetensors": 14, // count of .safetensors files 17 | "model_path": "E:/vigogne/vigogne-2-13b-instruct", 18 | 19 | // name of tokenizer 20 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 21 | 22 | // ------------------------------------ 23 | // transformer parameters 24 | 25 | // rope value 26 | "rope_set": 10000.0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 27 | 28 | // ------------------------------------ 29 | // sampler parameters 30 | 31 | "temperature": 0.85, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 32 | "topp": 0.70, // 0.01 to 0.99: max probability sum of top tokens 33 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 34 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 35 | "topp_eos": false, // true: limit topp list size to token with probability >= EOS 36 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 37 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 38 | "eos_amp": 0.0, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 39 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 40 | "rand_seed": 1234, // (integer) random seed 41 | 42 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
43 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 44 | 45 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 46 | 47 | // ------------------------------------ 48 | 49 | // model load data conversion 50 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 51 | "cvt_f12": false, // convert model to float12 52 | "cvt_f8": false, // convert model to float8 (cannot with tinyllama) 53 | 54 | // hardware parameters 55 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 56 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 57 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 58 | 59 | // run parameters 60 | "run_mode": 0, // 0: generate, 1:chat 61 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 62 | "token_eos_str": "", // end of string token (assistant reply end) 63 | "token_eot_str": "", // end of text token (dialog/generate end) 64 | 65 | // tokens display options in chat or generate mode 66 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
67 | "tok_disp_split": false, // true: display tokens separated with ',' 68 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 69 | 70 | // ------------------------------------ 71 | // generate mode prompt init 72 | 73 | "gen_mode_prompt": " L'explication de l'existence de saisons est", 74 | 75 | // ------------------------------------ 76 | // chat mode config 77 | 78 | // dialog colors (r.g.b format) 79 | "chat_use_colors": true, // use colors for chat 80 | "chat_col_msg": "250.250.250", // messages text color 81 | "chat_col_user": "180.255.180", // user text color (keyboard input) 82 | "chat_col_assistant": "180.180.255", // assistant answer text color 83 | 84 | // forward: define what is displayed when forward user prompt 85 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 86 | 87 | // ------------------------------------ 88 | // promp mode: define the method to generate the prompt format 89 | // 0: use model_ident value to select templates defined in chat.c 90 | // 1: user defined templates cm1_xxx.. 91 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 92 | "chat_prompt_mode": 0, 93 | 94 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 95 | "chat_assistant_name": "Vigogne:", 96 | "chat_user_name": "User:", 97 | 98 | // ------------------------------------ 99 | // chat_prompt_mode=0 parameters 100 | 101 | "cm0_sys_prompt": "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", 102 | "cm0_user_prompt": "Bonjour ! 
Comment ça va aujourd'hui ?", 103 | 104 | // ------------------------------------ 105 | // chat_prompt_mode=1 parameters (user defined template) 106 | /* 107 | // format for 7B, see https://huggingface.co/bofenghuang/vigogne-2-7b-chat 108 | "cm1_sys_template": "<|system|>: %s\n", 109 | "cm1_user_first_template": "", 110 | "cm1_user_template": "<|user|>: %s\n<|assistant|>:", 111 | "cm1_end_template": "\n", // end of assistant reply 112 | */ 113 | // format for 70B, see https://huggingface.co/bofenghuang/vigogne-2-70b-chat 114 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", 115 | "cm1_user_first_template": "%s [/INST]", // first user template 116 | "cm1_user_template": "[INST] %s [/INST]", 117 | "cm1_end_template": "\n", 118 | 119 | "cm1_sys_prompt": "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", 120 | "cm1_user_prompt": "Bonjour ! Comment ça va aujourd'hui ?", 121 | 122 | // ------------------------------------ 123 | // init prompt mode 2 (generate mode) 124 | // for mode 2 correct work: 125 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 126 | // - no space at end of cm2_user_name_sw 127 | // - terminate sys prompt to user name 128 | 129 | // templates for mode 2 130 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 131 | "cm2_user_template": " %s\nTiny:", // %s = cm2_user_prompt at init and next using keyboard input string 132 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 133 | 134 | // llama1 generate type chat example 135 | "cm2_sys_prompt": 136 | "Transcript of a dialog, where the User interacts with an assistant named Tiny. " 137 | +"Tiny is good at computer programming and never fails to respond to user requests accurately.\n\n" 138 | +"User: Hello Tiny.\n" 139 | +"Tiny: Hello. How may I help you today?\n" 140 | +"User:", 141 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
142 | } 143 | -------------------------------------------------------------------------------- /src/utils/utf8.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "utf8.h" 4 | 5 | // ------------------------------------ 6 | // UTF8 7 | 8 | // encode one char to utf8, return length, 0 if error 9 | int utf8_char_encode(char *s, int code) 10 | { 11 | // 7 bits 0bbb.bbbb 12 | if (code < (1 << 7)) 13 | { 14 | s[0] = code; 15 | return 1; 16 | } 17 | // 5 + 6 bits 110b.bbbb 10bb.bbbb 18 | if (code < (1 << (5 + 6))) 19 | { 20 | s[0] = 0xc0 | (code >> 6); 21 | s[1] = 0x80 | (code & 0x3f); 22 | return 2; 23 | } 24 | // 4 + 6 + 6 bits 1110.bbbb 10bb.bbbb 10bb.bbbb 25 | if (code < (1 << (4 + 6 + 6))) 26 | { 27 | s[0] = 0xe0 | (code >> 12); 28 | s[1] = 0x80 | ((code >> 6) & 0x3f); 29 | s[2] = 0x80 | (code & 0x3f); 30 | return 3; 31 | } 32 | // 3 + 6 + 6 + 6 bits 1111.0bbb 10bb.bbbb 10bb.bbbb 10bb.bbbb 33 | if (code < (1 << (3 + 6 + 6 + 6))) 34 | { 35 | s[0] = 0xf0 | (code >> 18); 36 | s[1] = 0x80 | ((code >> 12) & 0x3f); 37 | s[2] = 0x80 | ((code >> 6) & 0x3f); 38 | s[3] = 0x80 | (code & 0x3f); 39 | return 4; 40 | } 41 | // code too big to encode 42 | return 0; 43 | } 44 | 45 | // return encoded value and length 46 | int utf8_char_decode(const char *s, int *code) 47 | { 48 | // 7 bits 0bbb.bbbb 49 | if (!(s[0] & 0x80)) 50 | { 51 | *code = s[0]; 52 | return 1; 53 | } 54 | // 5 + 6 bits 110b.bbbb 10bb.bbbb 55 | if ((s[0] & 0xe0) == 0xc0) 56 | { 57 | *code = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); 58 | if ((s[1] & 0xc0) == 0x80) 59 | return 2; 60 | } 61 | // 4 + 6 + 6 bits 1110.bbbb 10bb.bbbb 10bb.bbbb 62 | if ((s[0] & 0xf0) == 0xe0) 63 | { 64 | *code = ((s[0] & 0xf) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); 65 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) 66 | return 3; 67 | } 68 | // 3 + 6 + 6 + 6 bits 1111.0bbb 10bb.bbbb 10bb.bbbb 10bb.bbbb 69 | if ((s[0] & 0xf8) == 0xf0) 70 | { 71 | *code = 
((s[0] & 0x7) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); 72 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80) && ((s[3] & 0xc0) == 0x80)) 73 | return 4; 74 | } 75 | // invalid encoding 76 | *code = 0; 77 | return 0; 78 | } 79 | 80 | // return encoded length 81 | int utf8_char_len(const char *s) 82 | { 83 | if (!(s[0] & 0x80)) 84 | return 1; 85 | if ((s[0] & 0xe0) == 0xc0) 86 | { 87 | if ((s[1] & 0xc0) == 0x80) 88 | return 2; 89 | } 90 | else 91 | if ((s[0] & 0xf0) == 0xe0) 92 | { 93 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) 94 | return 3; 95 | } 96 | else 97 | if ((s[0] & 0xf8) == 0xf0) 98 | { 99 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80) && ((s[3] & 0xc0) == 0x80)) 100 | return 4; 101 | } 102 | // invalid encoding 103 | return 0; 104 | } 105 | 106 | // return count of utf8 char coded in string, return 0 if coding error found 107 | int utf8_get_char_count(const char *s) 108 | { 109 | int char_count = 0; 110 | while (*s) 111 | { 112 | int l = utf8_char_len(s); 113 | if (!l) 114 | return 0; // encoding error 115 | s += l; 116 | char_count++; 117 | } 118 | return char_count; 119 | } 120 | 121 | // convert text cr + lf or single lf to single cr 122 | bool utf8_cvt_crlf_to_cr(char *s) 123 | { 124 | char *d = s; 125 | while (*s) 126 | { 127 | int l = utf8_char_len(s); 128 | if (!l) 129 | break; // encoding error 130 | 131 | if (*s == 0x0d) // CR + LF to CR 132 | { 133 | *d++ = 0x0d; 134 | s += (s[1] == 0x0a) ? 
2 : 1; 135 | } 136 | else 137 | if (*s == 0x0a) // LF to CR 138 | { 139 | *d++ = 0x0d; 140 | s++; 141 | } 142 | else 143 | while (l--) 144 | *d++ = *s++; 145 | } 146 | *d = 0; 147 | return !*s; 148 | } 149 | 150 | #if 0 151 | // ----------------------------------------------- 152 | // test code 153 | // https://fr.wikipedia.org/wiki/UTF-8 154 | #include 155 | 156 | typedef struct 157 | { 158 | int code; 159 | char *s; 160 | int len; 161 | } ut_t; 162 | 163 | // wiki page examples 164 | const ut_t u_list[] = { 165 | { 159 , "\xC2\x9F", 2 }, 166 | { 160 , "\xC2\xA0", 2 }, 167 | { 191 , "\xC2\xBF", 2 }, 168 | { 192 , "\xC3\x80", 2 }, 169 | { 233 , "\xC3\xA9", 2 }, 170 | { 2047 , "\xDF\xBF", 2 }, 171 | { 2048 , "\xE0\xA0\x80", 3 }, 172 | { 8364 , "\xE2\x82\xAC", 3 }, 173 | { 55295 , "\xED\x9F\xBF", 3 }, 174 | { 57344 , "\xEE\x80\x80", 3 }, 175 | { 63743 , "\xEF\xA3\xBF", 3 }, 176 | { 63744 , "\xEF\xA4\x80", 3 }, 177 | { 64975 , "\xEF\xB7\x8F", 3 }, 178 | { 64976 , "\xEF\xB7\x90", 3 }, 179 | { 65007 , "\xEF\xB7\xAF", 3 }, 180 | { 65008 , "\xEF\xB7\xB0", 3 }, 181 | { 65533 , "\xEF\xBF\xBD", 3 }, 182 | { 65534 , "\xEF\xBF\xBE", 3 }, 183 | { 65535 , "\xEF\xBF\xBF", 3 }, 184 | { 65536 , "\xF0\x90\x80\x80", 4 }, 185 | { 119070 , "\xF0\x9D\x84\x9E", 4 }, 186 | { 131069 , "\xF0\x9F\xBF\xBD", 4 }, 187 | { 131070 , "\xF0\x9F\xBF\xBE", 4 }, 188 | { 131071 , "\xF0\x9F\xBF\xBF", 4 }, 189 | { 131072 , "\xF0\xA0\x80\x80", 4 }, 190 | { 196605 , "\xF0\xAF\xBF\xBD", 4 }, 191 | { 196606 , "\xF0\xAF\xBF\xBE", 4 }, 192 | { 196607 , "\xF0\xAF\xBF\xBF", 4 }, 193 | { 196608 , "\xF0\xB0\x80\x80", 4 }, 194 | { 262141 , "\xF0\xBF\xBF\xBD", 4 }, 195 | { 262142 , "\xF0\xBF\xBF\xBE", 4 }, 196 | { 262143 , "\xF0\xBF\xBF\xBF", 4 }, 197 | { 917504 , "\xF3\xA0\x80\x80", 4 }, 198 | { 983037 , "\xF3\xAF\xBF\xBD", 4 }, 199 | { 983038 , "\xF3\xAF\xBF\xBE", 4 }, 200 | { 983039 , "\xF3\xAF\xBF\xBF", 4 }, 201 | { 983040 , "\xF3\xB0\x80\x80", 4 }, 202 | { 1048573 , "\xF3\xBF\xBF\xBD", 4 }, 203 | { 1048574 , 
"\xF3\xBF\xBF\xBE", 4 }, 204 | { 1048575 , "\xF3\xBF\xBF\xBF", 4 }, 205 | { 1048576 , "\xF4\x80\x80\x80", 4 }, 206 | { 1114109 , "\xF4\x8F\xBF\xBD", 4 }, 207 | { 1114110 , "\xF4\x8F\xBF\xBE", 4 }, 208 | { 1114111 , "\xF4\x8F\xBF\xBF", 4 }, 209 | { 0, NULL, 0 } }; 210 | 211 | int main(void) 212 | { 213 | int i; 214 | for (i=0; u_list[i].s; i++) 215 | { 216 | const ut_t *u = &u_list[i]; 217 | int l, code; 218 | l = utf8_char_len(u->s); 219 | if (l != u->len) 220 | break; 221 | l = utf8_char_decode(u->s, &code); 222 | if (l != u->len) 223 | break; 224 | if (code != u->code) 225 | break; 226 | } 227 | if (u_list[i].s) 228 | printf("decode failed.\n"); 229 | 230 | // check encode 231 | for (i=0; i < (1 << (3 + 6 + 6 + 6)); i++) 232 | { 233 | char s[8]; 234 | int code; 235 | int le = utf8_char_encode(s, i); 236 | int ld = utf8_char_decode(s, &code); 237 | if (!le || (ld != le) || (code != i)) 238 | { 239 | printf("encode failed.\n"); 240 | break; 241 | } 242 | } 243 | } 244 | #endif -------------------------------------------------------------------------------- /run_json/run_llama3.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama3", // define model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 4, // count of .safetensors files 12 | "model_path": "D:/llama3_st/8b-instruct", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty: model_path/tokenizer.json used else define full path+name 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | "temperature": 1.0, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 26 | "topp": 0.65, // 0.01 to 0.99: max probability sum of top tokens 27 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 28 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 29 | "topp_eos": false, // true: limit topp list size to token with probability >= EOS 30 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 31 | "repeat_penalty_n": 0, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 32 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 33 | "eos_amp_n": 250, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 34 | "rand_seed": 1234, // (integer) random seed 35 | 36 | // (optional) if ch_restrict defined, allow to sample only tokens that contain ascii chars and utf8 chars contained in sample_restrict string. 
37 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 38 | 39 | "test_nan_logits": false, // test for NAN at sampling in logits result 40 | 41 | // ------------------------------------ 42 | 43 | // model load data conversion 44 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 45 | "cvt_f12": false, // convert model to float12 (should be possible with all models, require all weights <= 4.0) 46 | "cvt_f8": false, // convert model to float8 (not possible with some models, require all weights <= 2.0) 47 | 48 | // hardware parameters 49 | "num_procs": 12, // -1: max auto detected (may be adjusted), >0: user value. note: max procs do not always produce best performances 50 | "numa_nodes": -1, // -1: use all detected. 0: skip numa specific code, >0: max nodes to use 51 | "simd_mode": -1, // -1: max detect, 0:fpu 1:sse 2:avx, 3:avx2 52 | 53 | // run parameters 54 | "run_mode": 0, // 0: generate, 1:chat 55 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 56 | "token_eos_str": "<|eot_id|>", // end of string token (assistant reply end) 57 | "token_eot_str": "<|end_of_text|>", // end of text token (dialog/generate end) 58 | 59 | // tokens display options in chat or generate mode 60 | "tok_disp_raw": false, // true: display control/byte for tokens (LF,, etc) 61 | "tok_disp_split":false, // true: display token list separated with ',' 62 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 63 | 64 | // ------------------------------------ 65 | // generate mode config 66 | 67 | "gen_mode_prompt": "<|begin_of_text|>The explanation for the existence of seasons is", 68 | 69 | // ------------------------------------ 70 | // chat mode config 71 | 72 | // dialog colors (r.g.b format) 73 | "chat_use_colors": true, // use colors for chat 74 | "chat_col_msg": "250.250.250", // messages text color 75 | "chat_col_user": "180.255.180", // user text 
color (keyboard input) 76 | "chat_col_assistant": "180.180.255", // assistant answer text color 77 | 78 | // forward: define what is displayed when forward user prompt 79 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test/check mode) 80 | 81 | // ------------------------------------ 82 | // promp mode: define the method to generate the prompt format 83 | // 0: use model_ident value to select templates defined in chat.c 84 | // 1: user defined templates cm1_xxx.. 85 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 86 | "chat_prompt_mode": 0, // 0: use model_ident value to define templates. 1, user defined template, 2 use generate mode (llama1) 87 | 88 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 89 | "chat_assistant_name": "LLama3:", 90 | "chat_user_name": "User:", 91 | 92 | // ------------------------------------ 93 | // chat_prompt_mode=0 parameters 94 | 95 | "cm0_sys_prompt": "You are a helpful AI assistant for travel tips and recommendations", 96 | "cm0_user_prompt": "What is France's capital?", 97 | 98 | // ------------------------------------ 99 | // chat_prompt_mode=1 parameters (user defined template) 100 | 101 | // https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ 102 | // https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1 103 | "cm1_sys_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|>", 104 | "cm1_user_first_template": "", 105 | "cm1_user_template": "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", 106 | "cm1_end_template": "<|eot_id|>\n", 107 | 108 | "cm1_sys_prompt": "You are a helpful assistant.", 109 | "cm1_user_prompt": "Explain shortly what is a pointer in C language ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure 
defined names coherences in user_template/sys_prompt/user_name_sw 115 | // - ensure no space at end of user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates required for mode 1 or 2 119 | "cm2_sys_template": "<|begin_of_text|>%s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next with keyboard input string 121 | // user name switch, required for mode 2 only, string detect switch to user in sys prompt (size 5..10 char, no space at end) 122 | "cm2_user_name_sw": "\nUser:", 123 | 124 | // here is llama.cpp project llama1 chat example 125 | "cm2_sys_prompt": 126 | "Transcript of a dialog, where the User interacts with an Assistant named Bob. " 127 | +"Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\n\n" 128 | +"User: Hello, Bob.\n" 129 | +"Bob: Hello. How may I help you today?\n" 130 | +"User:", 131 | "cm2_user_prompt": "Do you know what is the first prime number greater than 15 ?" 132 | } 133 | --------------------------------------------------------------------------------