├── doc ├── chat_ll2.jpg ├── gen_ll2.jpg └── linux_build.txt ├── src ├── vs_inc │ ├── stdbool.h │ └── stdint.h ├── model │ ├── load │ │ ├── load_tokenizer.c │ │ ├── load_transformer.h │ │ ├── load_tokenizer.h │ │ └── json.h │ ├── omp_numa.h │ ├── sampler.h │ ├── tr_opt_inc.c │ ├── tokenizer.h │ ├── kv_cache.c │ └── model.h ├── matmul │ ├── tr_opt_simd.h │ ├── w_types.h │ ├── matmul_f32.c │ ├── matmul_priv.h │ ├── mm_hsum.h │ ├── matmul.h │ ├── matmul_f16.c │ ├── matmul_bf16.c │ ├── tr_opt_simd.c │ └── matmul_sf16.c ├── utils │ ├── mem_alloc.h │ ├── utf8.h │ ├── time_ev.h │ ├── time_ev.c │ ├── term_utf8.h │ ├── numa.h │ ├── l_util.h │ ├── mem_alloc.c │ ├── numa_w.c │ └── utf8.c ├── dir_info.txt ├── main.c └── generate.c ├── tests ├── 1_node │ ├── gen_f12 │ │ ├── gen_vigogne2.txt │ │ ├── gen_codellama.txt │ │ ├── gen_llama1.txt │ │ ├── gen_tinyllama.txt │ │ ├── gen_llama2.txt │ │ ├── gen_zephyr.txt │ │ ├── gen_llama3.txt │ │ ├── gen_mistral.txt │ │ ├── gen_llama31.txt │ │ └── gen_qwen2.txt │ ├── gen_ref │ │ ├── gen_vigogne2.txt │ │ ├── gen_codellama.txt │ │ ├── gen_llama1.txt │ │ ├── gen_llama2.txt │ │ ├── gen_tinyllama.txt │ │ ├── gen_mathstral_fp32.txt │ │ ├── gen_llama3.txt │ │ ├── gen_mistral.txt │ │ ├── gen_zephyr.txt │ │ ├── gen_mixtral_f8.txt │ │ └── gen_qwen2.txt │ └── res_1socket.txt ├── dir_info.txt └── 2_nodes │ ├── res_2sockets.txt │ └── llama2_ht_off.txt ├── LICENSE ├── llama_st.sln ├── make_gcc.txt └── run_json ├── run_llama1.json ├── run_codellama.json ├── run_mathstral.json ├── run_mistral.json ├── run_tinyllama.json ├── run_zephyr.json ├── run_llama2.json ├── run_mixtral.json ├── run_vigogne2.json └── run_llama3.json /doc/chat_ll2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/doc/chat_ll2.jpg -------------------------------------------------------------------------------- /doc/gen_ll2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/doc/gen_ll2.jpg -------------------------------------------------------------------------------- /src/vs_inc/stdbool.h: -------------------------------------------------------------------------------- 1 | typedef unsigned int bool; 2 | #define true 1 3 | #define false 0 4 | -------------------------------------------------------------------------------- /src/model/load/load_tokenizer.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/src/model/load/load_tokenizer.c -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_vigogne2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/tests/1_node/gen_f12/gen_vigogne2.txt -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_vigogne2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierrel55/llama_st/HEAD/tests/1_node/gen_ref/gen_vigogne2.txt -------------------------------------------------------------------------------- /src/vs_inc/stdint.h: -------------------------------------------------------------------------------- 1 | typedef __int64 int64_t; 2 | typedef unsigned __int64 uint64_t; 3 | typedef int int32_t; 4 | typedef unsigned int uint32_t; 5 | -------------------------------------------------------------------------------- /src/model/load/load_transformer.h: -------------------------------------------------------------------------------- 1 | // load config 2 | void load_checkpoint_config(void); 3 | 4 | // load checkpoint weights 5 | void load_checkpoint_weights(void); 6 | 
-------------------------------------------------------------------------------- /doc/linux_build.txt: -------------------------------------------------------------------------------- 1 | Linux build require a linux port for sources: 2 | - term_utf8_w.c 3 | - numa_w.c 4 | - time_ev.c 5 | 6 | Other sources should build without changes for x86 64 arch. 7 | -------------------------------------------------------------------------------- /src/model/load/load_tokenizer.h: -------------------------------------------------------------------------------- 1 | // find merge datas, return NULL if not found (code in load_tokenizer.c) 2 | const struct merge_id_t *bpe_find_merge(int l_id, int r_id); 3 | 4 | // load (private to tokenizer.c, defined in load_tokenizer.c) 5 | void load_tokenizer(const char *file_name); 6 | -------------------------------------------------------------------------------- /tests/dir_info.txt: -------------------------------------------------------------------------------- 1 | Contain speed test with and without data conversion and hyperthreading on/off using various models. 
2 | 3 | 1_node: 4 | tested with xeon e5 2680 v4 (14c 2400Mhz), 64Gb ram ddr4 2400 4ch 5 | 6 | 2_nodes: 7 | tested with 2 * xeon e5 2650 v4 (12c 2200Mhz), 128Gb ram ddr4 2400 (64Gb/socket) 8 | -------------------------------------------------------------------------------- /src/matmul/tr_opt_simd.h: -------------------------------------------------------------------------------- 1 | // ----------------------------------------------------- 2 | // simd optimized head attention (code in tr_opt_simd.c) 3 | 4 | typedef void (* head_att_opt_t)(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t *p); 5 | 6 | // head attention simd (defined by matmul_init()) 7 | extern head_att_opt_t head_att_opt; 8 | 9 | // init 10 | void init_head_att_opt(enum e_simd_typ simd_typ); 11 | -------------------------------------------------------------------------------- /src/utils/mem_alloc.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------ 2 | // memory allocation with check 3 | 4 | void *malloc_check(size_t size); 5 | void *calloc_check(size_t size); 6 | void *realloc_check(void *ptr, size_t size); 7 | void free_check(void *ptr); 8 | 9 | // print currently allocated size 10 | void dbg_print_alloc(void); 11 | 12 | // alloc string 13 | char *str_alloc(const char *str, int len); 14 | 15 | #define VAR_ALLOC(var, typ, ne) typ *var = (typ *)malloc_check((ne)*sizeof(typ)) 16 | -------------------------------------------------------------------------------- /src/matmul/w_types.h: -------------------------------------------------------------------------------- 1 | // weights data types 2 | enum e_w_type 3 | { 4 | w_type_f32 = 0, 5 | w_type_f16, 6 | w_type_bf16, 7 | w_type_sf16, 8 | w_type_f12, 9 | w_type_f8, 10 | w_type_COUNT, 11 | }; 12 | 13 | // types sizeof 14 | static const unsigned int w_type_sizeof[w_type_COUNT] = { 4, 2, 2, 2, 2, 1 }; 15 | 16 | // names 
of types (in matmul.c) 17 | extern const char *w_type_name[w_type_COUNT]; 18 | 19 | // C types 20 | typedef unsigned short f16_t; 21 | typedef unsigned short bf16_t; 22 | typedef unsigned short sf16_t; 23 | typedef unsigned short f12_t; 24 | typedef unsigned char f8_t; 25 | -------------------------------------------------------------------------------- /src/utils/utf8.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------ 2 | // UTF8 3 | 4 | // encode one char to utf8, return length, 0 if error 5 | int utf8_char_encode(char *s, int code); 6 | 7 | // return char encoded code value and length 8 | int utf8_char_decode(const char *s, int *code); 9 | 10 | // return char encoded length and test if coding is valid 11 | int utf8_char_len(const char *s); 12 | 13 | // return count of utf8 char coded in string, return 0 if coding error found 14 | int utf8_get_char_count(const char *s); 15 | 16 | // text convert cr + lf or lf alone to cr 17 | bool utf8_cvt_crlf_to_cr(char *s); 18 | -------------------------------------------------------------------------------- /src/dir_info.txt: -------------------------------------------------------------------------------- 1 | directory matmul 2 | contain: 3 | - simd optimized matmul/float conversion code. interface matmul.h 4 | - simd optimized code for transformer self-attention (tr_opt_simd.c) 5 | 6 | directory model 7 | contain: 8 | - tokenizer/transformer/sampler code 9 | 10 | directory model/load 11 | contain: 12 | - json parser 13 | - tokenizer/transformer loader 14 | 15 | directory vs_inc 16 | Visual studio 2012 express was used to build the project, but stdbool.h and stdint.h are not defined as standard headers with this compiler, so a minimal definition is done here. 17 | It is not required for GCC build. 
18 | 19 | directory util 20 | contain various utils non-llm specific code 21 | -------------------------------------------------------------------------------- /tests/1_node/res_1socket.txt: -------------------------------------------------------------------------------- 1 | f16/bf16 2 | 3 | llama1 7B fp16 231 5.04 4 | llama2 7B fp16 249 5.09 5 | llama3 8B bf16 268 4.50 6 | llama31 8B bf16 398 4.50 7 | codellama 7B bf16 231 5.10 8 | mistral 7B bf16 331 4.71 9 | zephyr 7B bf16 294 4.77 10 | vigogne2 13B fp16 605 2.57 11 | tinyllama 1.1B bf16 297 31.11 12 | qwen2 7B bf16 543 4.75 13 | qwen2 0.5B bf16 67 55.05 14 | 15 | fp32 16 | 17 | mathstral 7B fp32 231 2.40 18 | 19 | using f12 conv 20 | 21 | llama1 7B fp16 231 6.17 22 | llama2 7B fp16 249 6.21 23 | llama3 8B bf16 268 5.53 24 | llama31 8B bf16 398 5.51 25 | codellama 7B bf16 231 6.26 26 | mistral 7B bf16 396 5.88 27 | zephyr 7B bf16 247 5.95 28 | vigogne2 13B fp16 781 3.15 29 | tinyllama 1.1B bf16 237 37.15 30 | qwen2 7B bf16 543 5.83 31 | 32 | using f8 conv 33 | 34 | mixtral 8x7B bf16 384 4.84 35 | -------------------------------------------------------------------------------- /src/utils/time_ev.h: -------------------------------------------------------------------------------- 1 | // ----------------------------------------- 2 | // debug/optimization: eval operations times 3 | 4 | // enable to eval times with debugger 5 | // #define OPT_EVAL_TIMES 6 | 7 | #ifdef OPT_EVAL_TIMES 8 | 9 | #define MAX_TIMES 16 10 | 11 | struct op_time_t 12 | { 13 | uint64_t t0; 14 | uint64_t t_sum; 15 | int n_call; 16 | }; 17 | 18 | extern struct op_time_t op_time[MAX_TIMES]; 19 | 20 | uint64_t get_time_ctr(void); 21 | 22 | static __inline void tm_stop(int id) 23 | { 24 | struct op_time_t *t = &op_time[id]; 25 | t->t_sum += (get_time_ctr() - t->t0); 26 | t->n_call++; 27 | } 28 | 29 | void tm_print(void); 30 | 31 | #define T_START(id) op_time[id].t0 = get_time_ctr() 32 | #define T_STOP(id) tm_stop(id) 33 | #define T_RESET() 
memset(op_time, 0, sizeof(op_time)) 34 | #define T_CLR(id) op_time[id].t_sum = 0; op_time[id].n_call = 0 35 | #define T_PRINT() tm_print() 36 | 37 | #else 38 | 39 | #define T_RESET() 40 | #define T_START(id) 41 | #define T_STOP(id) 42 | #define T_CLR(id) 43 | #define T_PRINT() 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/utils/time_ev.c: -------------------------------------------------------------------------------- 1 | // ------------------------------------------- 2 | // time eval 3 | 4 | #include 5 | #include "time_ev.h" 6 | 7 | #ifdef OPT_EVAL_TIMES 8 | 9 | #include 10 | #include 11 | 12 | struct op_time_t op_time[MAX_TIMES] = { 0 }; 13 | 14 | uint64_t time_ctr_freq = 0; 15 | 16 | static void init_time_ctr_freq(void) 17 | { 18 | LARGE_INTEGER freq; 19 | QueryPerformanceFrequency(&freq); 20 | time_ctr_freq = freq.QuadPart; 21 | } 22 | 23 | uint64_t get_time_ctr(void) 24 | { 25 | LARGE_INTEGER ticks; 26 | QueryPerformanceCounter(&ticks); 27 | return ticks.QuadPart; 28 | } 29 | 30 | // print all times 31 | void tm_print(void) 32 | { 33 | int i; 34 | if (!time_ctr_freq) 35 | init_time_ctr_freq(); 36 | 37 | printf("\n----------\ntime list:\n"); 38 | for (i=0; in_call) 42 | printf("time[%d]: nc:%d\t dt:%.4f s\n", i, t->n_call, (double)t->t_sum/time_ctr_freq); 43 | } 44 | printf("----------\n"); 45 | } 46 | 47 | #endif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 pierrel55 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/model/omp_numa.h: -------------------------------------------------------------------------------- 1 | // omp for numa support 2 | 3 | // thread list numa map 4 | struct numa_thread_map_t 5 | { 6 | int nt_mp; // num threads in main process 7 | int n_threads; 8 | unsigned char tid_to_proc_id[MAX_NUMA_PROCS]; 9 | unsigned char tid_to_node_id[MAX_NUMA_PROCS]; 10 | }; 11 | 12 | extern struct numa_thread_map_t numa_map; 13 | 14 | // get weight data dy split size for y 15 | #define WD_GET_DY(y, dy, wy) ((y + dy) <= wy) ? dy : wy - y 16 | 17 | // return sizeof wd ne elements in bytes (usage required where f12 can be used) 18 | size_t wd_ne_sizeof(const struct w_dat_t *wd, size_t ne); 19 | 20 | // split and alloc weight datas in different memory nodes for numa configurations. 21 | void numa_alloc_wd(struct w_dat_t *wd, int nz, int wy, int wx, enum e_w_type w_type, bool mm_split); 22 | 23 | // copy or load datas to weights for one z unit (layer). 
24 | void numa_cpy_wd_z(struct w_dat_t *wd, int z_id, const void *s, file_t *f); 25 | 26 | // init OMP for numa configuration 27 | void numa_init_omp(int cfg_n_procs, int cfg_n_nodes); 28 | 29 | // check omp thread proc match numa_map configuration 30 | void omp_proc_bind_numa_check(void); 31 | -------------------------------------------------------------------------------- /src/utils/term_utf8.h: -------------------------------------------------------------------------------- 1 | // UTF8 terminal 2 | 3 | // define a RGB color using "r.g.b" string, ex: "180.255.180" 4 | int term_get_color(const char *col_str); 5 | 6 | // define user_col[col_id] RGB color, must be done before call to term_init() 7 | void term_def_color(int col_id, int color); 8 | 9 | // set print color 10 | void text_color(int col_id); 11 | 12 | // single init, required if text_color() used 13 | void term_init(void); 14 | 15 | // wait for ms (debug usage) 16 | void term_wait_ms(int ms); 17 | 18 | // print UTF8 string and manage cr alone as cr + lf 19 | bool print_utf8(const char *s); 20 | 21 | // print UTF8 string and display control chars code (debug/check usage) 22 | void print_utf8_raw(const char *s); 23 | 24 | // ensure cursor position at new line 25 | void cursor_nl(void); 26 | void cursor_nl_set(void); 27 | 28 | // keyboard input a string, return utf8 encoded buffer size 29 | int kbd_input_utf8(char *s, int s_sizeof); 30 | 31 | // read a key without wait 32 | int read_key(void); 33 | 34 | // sleep for ms time 35 | void sleep_ms(int ms); 36 | 37 | // clipboard (chat menu), copy utf8 text to clipboard 38 | void term_cb_clear(void); 39 | void term_cb_add_utf8(const char *utf8); 40 | void term_cb_copy(void); 41 | -------------------------------------------------------------------------------- /llama_st.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Express 2012 for Windows 
Desktop 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llama_st", "llama_st.vcxproj", "{E88EB555-D508-48F5-8C4E-C3E6C9CC9903}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Debug|x64 = Debug|x64 10 | Release|Win32 = Release|Win32 11 | Release|x64 = Release|x64 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|Win32.ActiveCfg = Debug|Win32 15 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|Win32.Build.0 = Debug|Win32 16 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|x64.ActiveCfg = Debug|x64 17 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Debug|x64.Build.0 = Debug|x64 18 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|Win32.ActiveCfg = Release|Win32 19 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|Win32.Build.0 = Release|Win32 20 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|x64.ActiveCfg = Release|x64 21 | {E88EB555-D508-48F5-8C4E-C3E6C9CC9903}.Release|x64.Build.0 = Release|x64 22 | EndGlobalSection 23 | GlobalSection(SolutionProperties) = preSolution 24 | HideSolutionNode = FALSE 25 | EndGlobalSection 26 | EndGlobal 27 | -------------------------------------------------------------------------------- /src/utils/numa.h: -------------------------------------------------------------------------------- 1 | // numa informations. 
2 | #define MAX_NUMA_PROCS 64 // max supported procs (need to manage processor group if more needed, or can also disable hyperthreading in BIOS) 3 | #define MAX_NUMA_NODES 8 // max supported nodes (can be increased with current code) 4 | 5 | // numa informations 6 | struct numa_inf_t 7 | { 8 | int mt_node; // main thread node 9 | int mt_procs; // main thread node proc count 10 | int n_nodes; // nodes count 11 | int n_procs; // physical processors count 12 | unsigned char proc_list[MAX_NUMA_PROCS]; // procs list batched with same node id 13 | unsigned char proc_node[MAX_NUMA_PROCS]; // node id for each processor in proc_list 14 | unsigned char node_nprocs[MAX_NUMA_NODES]; // proc count in each node 15 | }; 16 | 17 | // global numa informations, use as read only 18 | extern struct numa_inf_t numa; 19 | 20 | // init numa struct 21 | void init_numa_info(void); 22 | 23 | // display mem available in nodes 24 | void numa_disp_mem(void); 25 | 26 | // set proc for current thread 27 | bool numa_set_thread_proc(int proc_id); 28 | 29 | // return proc for current thread 30 | int numa_get_thread_proc(void); 31 | 32 | // -------------------------- 33 | // memory alloc/free 34 | 35 | // reserve physical memory in node 36 | void *numa_alloc(size_t sz, int node); 37 | 38 | // free memory allocated with numa_alloc 39 | void numa_free(void *p); 40 | -------------------------------------------------------------------------------- /src/main.c: -------------------------------------------------------------------------------- 1 | #include "l_util.h" 2 | #include "model.h" 3 | 4 | #ifdef CHECK_EXIT 5 | #include "mem_alloc.h" 6 | #include "omp_numa.h" 7 | #endif 8 | 9 | int main(int argc, char *argv[]) 10 | { 11 | if (APP_ERROR()) // catch error return point 12 | return -1; 13 | 14 | #if 1 15 | if (argc != 2) 16 | { 17 | msg_info("Usage: llama_st \n"); 18 | msg_info("Example: llama_st run_json/run_llama2.json\n"); 19 | return -1; 20 | } 21 | build_model(argv[1]); 22 | #else 23 | // dev mode 
// ------------------------------------------------------------------
// f32 * f32 => f32, scalar FPU fallback (no SIMD).
// res[y] = dot product of vec with row y of mat, for y_mat rows of
// len_vec floats each (mat stored row-major, rows contiguous).
static void matmul_f32_f32_fpu(float *res, const float *vec, const float *mat, int len_vec, int y_mat)
{
  int y;
  for (y=0; y!=y_mat; y++)
  {
    const float *row = mat + y * len_vec;  // start of matrix row y
    float sum = 0.0f;
    int i;
    for (i=0; i!=len_vec; i++)
      sum += vec[i] * row[i];
    res[y] = sum;
  }
}
// ------------------------------------------------------------------
// f32 * f32 => f32, AVX + FMA3 version: 8 floats per step.
// NOTE(review): assumes vec/mat are 32-byte aligned (_mm256_load_ps)
// and len_vec is a multiple of 8 — TODO confirm against matmul_init()
// allocation rules, as with the other simd kernels.
static void matmul_f32_f32_avx1(float *res, const float *vec, const float *mat, int len_vec, int y_mat)
{
  int y;
  for (y=0; y!=y_mat; y++)
  {
    const float *row = mat + y * len_vec;       // matrix row y
    __m256 sum8 = _mm256_setzero_ps();          // 8 partial sums
    int i;
    for (i=0; i!=len_vec; i+=8)
      sum8 = _mm256_fmadd_ps(_mm256_load_ps(vec + i), _mm256_load_ps(row + i), sum8);
    res[y] = hsum_ps_avx1(sum8);                // horizontal reduce (mm_hsum.h)
  }
}
10 | load: E:/codellama/codellama-7b-instruct-hf/model-00001-of-00002.safetensors 11 | load: E:/codellama/codellama-7b-instruct-hf/model-00002-of-00002.safetensors 12 | sampler config: 13 | temperature : 0.60 14 | topp : 0.90 15 | topk : 30 16 | topp_minp : 0.05 17 | topp_eos : true 18 | repeat_penalty : 0.00 19 | repeat_penalty_n : 50 20 | eos_amp : 0.50 21 | eos_amp_n : 150 22 | rand seed : 1234 23 | Generate: max 16384 tokens.. 24 | - Press 'esc' key to break generation. 25 | bool is_prime(int x) 26 | { 27 | if (x < 2) 28 | return false; 29 | 30 | if (x % 2 == 0) 31 | return x == 2; 32 | 33 | int root = (int)std::sqrt(x); 34 | for (int i = 3; i <= root; i += 2) 35 | if (x % i == 0) 36 | return false; 37 | 38 | return true; 39 | } 40 | 41 | int main() 42 | { 43 | int n; 44 | cin >> n; 45 | 46 | vector nums(n); 47 | for (int i = 0; i < n; ++i) 48 | { 49 | cin >> nums[i]; 50 | } 51 | 52 | int count = 0; 53 | for (int i = 0; i < n; ++i) 54 | { 55 | if (is_prime(nums[i])) 56 | count++; 57 | } 58 | 59 | cout << count; 60 | 61 | return 0; 62 | } 63 | total time: 45.26s for 231 tokens, tok/s: 5.10 64 | Press any key to continue . . . 
#include "l_util.h"
#include "model.h"
#include "term_utf8.h"
#include "time_ev.h"

// re-use function defined in chat.c: decode token_id to text and print it.
// NOTE(review): prob < 0 presumably disables probability display — confirm in chat.c
void tokenizer_decode_print_ex(int token_id, float prob);

// ----------------------------------------------------------------------------
// generation loop: encode the configured prompt, forward it through the
// transformer, then sample and print tokens until EOS/EOT is sampled, the
// conf->gen_run_steps limit is reached, or the user presses the 'esc' key.
void generate(void)
{
  int i, t0, t1, n_gen, run_steps;
  struct run_conf_t *conf = &model.config;
  const struct mt_list_t *mt_list = &model.tokenizer.mt_list;

  msg_info("Generate: max %d tokens..\n", conf->gen_run_steps);
  msg_info("- Press 'esc' key to break generation.\n");
  T_RESET(); // dev mode, eval code time (no-op unless OPT_EVAL_TIMES defined)

  // time stats: t0/t1 in milliseconds, used for final tok/s report
  t0 = time_in_ms();

  // forward init prompt: last argument is true only for the final prompt
  // token (logits are only needed once the whole prompt is consumed)
  tokenizer_encode(conf->gen_mode_prompt);
  for (i=0; i<mt_list->n_list; i++)
  {
    int token = mt_list->mt[i].tok_id;
    forward(token, false, i == (mt_list->n_list-1));
    tokenizer_decode_print_ex(token, -1.0f); // -1.0: no sampled probability for prompt tokens
  }

  // generate
  for (run_steps = 0; run_steps != conf->gen_run_steps; run_steps++)
  {
    // get token from logits into samp
    struct prob_index_t *pi = sampler_sample();

    // stop generation if 'esc' key (code 27) pressed
    if (read_key() == 27)
    {
      msg_info("{esc stop}");
      break;
    }

    // data-dependent terminating condition: end-of-sequence / end-of-turn token
    if ( (pi->index == model.config.token_eos)
      || (pi->index == model.config.token_eot))
      break;

    // print generated token with its sampled probability
    tokenizer_decode_print_ex(pi->index, pi->prob);

    // update logits: feed the sampled token back into the transformer
    forward(pi->index, true, true);
  }

  T_PRINT(); // dev mode: print accumulated op times

  // time elapsed; n_gen counts all tokens in the kv cache (prompt + generated)
  t1 = time_in_ms();
  n_gen = model.transformer.state.cache.n_tokens;
  msg_info("\ntotal time: %.2fs for %d tokens, tok/s: %.2f\n", (t1-t0) / 1000.0, n_gen, n_gen*1000.0 / (t1-t0));
}
-------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_codellama.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_codellama.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/codellama/codellama-7b-instruct-hf/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/codellama/codellama-7b-instruct-hf/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 11 | load: E:/codellama/codellama-7b-instruct-hf/model-00001-of-00002.safetensors 12 | load: E:/codellama/codellama-7b-instruct-hf/model-00002-of-00002.safetensors 13 | sampler config: 14 | temperature : 0.60 15 | topp : 0.90 16 | topk : 30 17 | topp_minp : 0.05 18 | topp_eos : true 19 | repeat_penalty : 0.00 20 | repeat_penalty_n : 50 21 | eos_amp : 0.50 22 | eos_amp_n : 150 23 | rand seed : 1234 24 | Generate: max 16384 tokens.. 25 | - Press 'esc' key to break generation. 26 | bool is_prime(int x) 27 | { 28 | if (x < 2) 29 | return false; 30 | 31 | if (x % 2 == 0) 32 | return x == 2; 33 | 34 | int root = (int)std::sqrt(x); 35 | for (int i = 3; i <= root; i += 2) 36 | if (x % i == 0) 37 | return false; 38 | 39 | return true; 40 | } 41 | 42 | int main() 43 | { 44 | int n; 45 | cin >> n; 46 | 47 | vector nums(n); 48 | for (int i = 0; i < n; ++i) 49 | { 50 | cin >> nums[i]; 51 | } 52 | 53 | int count = 0; 54 | for (int i = 0; i < n; ++i) 55 | { 56 | if (is_prime(nums[i])) 57 | count++; 58 | } 59 | 60 | cout << count; 61 | 62 | return 0; 63 | } 64 | total time: 36.91s for 231 tokens, tok/s: 6.26 65 | Press any key to continue . . . 
-------------------------------------------------------------------------------- /src/matmul/matmul_priv.h: -------------------------------------------------------------------------------- 1 | // common private header for matmul/conversion code 2 | #define N_64K (1 << 16) 3 | 4 | #define ABS_F16(x) ((x) & 0x7FFF) // abs for F16/BF16/SF16, clear sign bit 5 | 6 | // -------------------------------------- 7 | // data conversion to float 32 functions 8 | 9 | extern const cvt_f16_to_f32_t cvt_f16_to_f32_procs[simd_n]; 10 | extern const cvt_bf16_to_f32_t cvt_bf16_to_f32_procs[simd_n]; 11 | extern const cvt_sf16_to_f32_t cvt_sf16_to_f32_procs[simd_n]; 12 | 13 | // -------------------------------------- 14 | // vector to matrix multiply functions 15 | 16 | extern const matmul_f32_f32_t matmul_f32_f32_procs[simd_n]; 17 | extern const matmul_f32_f16_t matmul_f32_f16_procs[simd_n]; 18 | extern const matmul_f32_bf16_t matmul_f32_bf16_procs[simd_n]; 19 | extern const matmul_f32_sf16_t matmul_f32_sf16_procs[simd_n]; 20 | extern const matmul_f32_f12_t matmul_f32_f12_procs[simd_n]; 21 | extern const matmul_f32_f8_t matmul_f32_f8_procs[simd_n]; 22 | 23 | // -------------------------------------- 24 | // SF16 conversions, code in matmul_sf16.c 25 | 26 | void init_conv_sf16(void); 27 | void cvt_f16_to_sf16(sf16_t *sf16, const f16_t *f16, size_t ne); 28 | 29 | // -------------------------------------- 30 | // F16 conversions, code in matmul_f16.c 31 | 32 | void init_sw_f16c(void); 33 | void free_sw_f16c(void); 34 | void cvt_f32_to_f16(f16_t *f16, const float *f32, size_t ne); 35 | 36 | // -------------------------------------- 37 | // F12 conversions, code in matmul_f12.c 38 | 39 | void init_conv_f12(void); 40 | void cvt_f16_to_f12(f12_t *f12, const f16_t *f16, size_t ne); 41 | void cvt_bf16_to_f12(f12_t *f12, const f16_t *f16, size_t ne); 42 | 43 | // -------------------------------------- 44 | // F8 conversions, code in matmul_f8.c 45 | 46 | void init_conv_f8(void); 47 | void 
cvt_f16_to_f8(f8_t *f8, const f16_t *f16, size_t ne); 48 | void cvt_bf16_to_f8(f8_t *f8, const bf16_t *bf16, size_t ne); -------------------------------------------------------------------------------- /src/model/sampler.h: -------------------------------------------------------------------------------- 1 | // ---------------------------------------------------------------------------- 2 | // The struct sampler_t, which takes logits and returns a sampled token 3 | // sampling can be done in a few ways: greedy argmax, sampling, top-p sampling 4 | 5 | // struct used when sorting probabilities during top-p sampling 6 | struct prob_index_t 7 | { 8 | float prob; 9 | int index; 10 | }; 11 | 12 | // sampler config 13 | struct sampler_conf_t 14 | { 15 | float temperature; // 0.0 to 2.0: 0:greedy decoding 2.0:maximum creativity 16 | float topp; // 0.01 to 0.99: max probability sum of top tokens 17 | int topk; // (integer) limit size of top tokens list 5..200 (0 = disable) 18 | float topp_minp; // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 19 | bool topp_eos; // true: limit topp list size to token with probability >= EOS 20 | float repeat_penalty; // 0.0..2.0 repeat penalty (0.0 = disable) 21 | int repeat_penalty_n; // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 22 | float eos_amp; // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 23 | int eos_amp_n; // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 24 | int rand_seed; // (integer) random seed 25 | char *ch_restrict; // if string defined, define ascii + allowed chars list in sampled tokens. 
26 | }; 27 | 28 | struct sampler_t 29 | { 30 | struct sampler_conf_t conf; // config 31 | 32 | uint64_t rng_state; 33 | struct prob_index_t *probindex; // buffer used in top-p sampling 34 | int *tk_select; // binary array for restricted tokens, (NULL if unused) 35 | }; 36 | 37 | void build_sampler(void); 38 | void free_sampler(void); 39 | 40 | // sample from transformer logits 41 | struct prob_index_t *sampler_sample(void); 42 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_llama1.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama1_st/7b/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama1_st/7b/config.json 7 | n_kv_heads undefined, assumed = n_heads (32) 8 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 9 | torch float type: fp16 10 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 11 | processor(s) core(s) used: 12 in 1 node(s). 12 | load: D:/llama1_st/7b/model-00001-of-00002.safetensors 13 | load: D:/llama1_st/7b/model-00002-of-00002.safetensors 14 | sampler config: 15 | temperature : 0.70 16 | topp : 0.75 17 | topk : 25 18 | topp_minp : 0.05 19 | topp_eos : true 20 | repeat_penalty : 0.05 21 | repeat_penalty_n : 50 22 | eos_amp : 0.50 23 | eos_amp_n : 150 24 | rand seed : 1234 25 | Generate: max 2048 tokens.. 26 | - Press 'esc' key to break generation. 27 | The explanation for the existence of seasons is a bit more complicated than the 28 | simple explanation that the Earth is tilted. It's actually a combination of the 29 | Earth's tilt and its orbit around the Sun. 30 | The Earth's axis is tilted at an angle of 23.5 degrees to the plane of its orbit 31 | around the Sun. 
This means that the northern hemisphere is tilted toward the Su 32 | n in summer and away from the Sun in winter. The southern hemisphere is tilted a 33 | way from the Sun in summer and toward the Sun in winter. 34 | The Earth's orbit around the Sun is not a perfect circle. Instead, it's an ellip 35 | se, or a stretched-out circle. The Sun is not at the center of the ellipse. It's 36 | at one of the foci, or the two points where the ellipse is widest. 37 | The Earth is closer to the Sun in January than in July. The distance between the 38 | Earth and the Sun varies from about 91 million miles in January to about 94 mil 39 | lion miles in July. 40 | total time: 45.83s for 231 tokens, tok/s: 5.04 41 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_llama1.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama1_st/7b/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama1_st/7b/config.json 7 | n_kv_heads undefined, assumed = n_heads (32) 8 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 9 | torch float type: fp16 10 | model weights converted to float12. 11 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 12 | processor(s) core(s) used: 12 in 1 node(s). 13 | load: D:/llama1_st/7b/model-00001-of-00002.safetensors 14 | load: D:/llama1_st/7b/model-00002-of-00002.safetensors 15 | sampler config: 16 | temperature : 0.70 17 | topp : 0.75 18 | topk : 25 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.05 22 | repeat_penalty_n : 50 23 | eos_amp : 0.50 24 | eos_amp_n : 150 25 | rand seed : 1234 26 | Generate: max 2048 tokens.. 27 | - Press 'esc' key to break generation. 
28 | The explanation for the existence of seasons is a bit more complicated than the 29 | simple explanation that the Earth is tilted. It's actually a combination of the 30 | Earth's tilt and its orbit around the Sun. 31 | The Earth's axis is tilted at an angle of 23.5 degrees to the plane of its orbit 32 | around the Sun. This means that the northern hemisphere is tilted toward the Su 33 | n in summer and away from the Sun in winter. The southern hemisphere is tilted a 34 | way from the Sun in summer and toward the Sun in winter. 35 | The Earth's orbit around the Sun is not a perfect circle. Instead, it's an ellip 36 | se, or a stretched-out circle. The Sun is not at the center of the ellipse. It's 37 | at one of the foci, or the two points where the ellipse is widest. 38 | The Earth is closer to the Sun in January than in July. The distance between the 39 | Earth and the Sun varies from about 91 million miles in January to about 94 mil 40 | lion miles in July. 41 | total time: 37.41s for 231 tokens, tok/s: 6.17 42 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_tinyllama.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_tinyllama.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: E:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0/model.safetensors 12 | sampler config: 13 | temperature : 0.60 14 | topp : 0.65 15 | topk : 25 16 | topp_minp : 0.05 17 | topp_eos : true 18 | repeat_penalty : 0.05 19 | repeat_penalty_n : 50 20 | eos_amp : 0.50 21 | eos_amp_n : 150 22 | rand seed : 1234 23 | Generate: max 2048 tokens.. 24 | - Press 'esc' key to break generation. 25 | The explanation for the existence of seasons is a complex and multifaceted proce 26 | ss that involves the Earth's orbit around the Sun, the Earth's rotation around t 27 | he Sun, and the Earth's position relative to the Sun's position in space. The pr 28 | ocess of seasons is driven by the Earth's orbit around the Sun, which causes the 29 | Earth's axis to tilt slightly relative to the direction of the Sun's travel. Th 30 | is tilt causes the Earth's surface to receive more or less solar radiation depen 31 | ding on the season. The Earth's rotation around the Sun also causes the Earth's 32 | magnetic field to vary, which affects the Earth's magnetic field and the way tha 33 | t energy is transported through the atmosphere. The Earth's position relative to 34 | the Sun's position in space also affects the amount of energy that is absorbed 35 | and released by the Earth's atmosphere. Overall, the complex and multifaceted pr 36 | ocess of seasons is driven by the Earth's orbit around the Sun, the Earth's rota 37 | tion around the Sun, and the Earth's position relative to the Sun's position in 38 | space 39 | total time: 6.38s for 237 tokens, tok/s: 37.15 40 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/matmul/mm_hsum.h: -------------------------------------------------------------------------------- 1 | #ifdef VS_2008 2 | // old compiler. some SSE not defined in intrin. (_mm_cvtph_ps/_mm_cvtps_ph/_mm_fmadd_ps etc..) 3 | // this use external linking. 
(slow but work) 4 | #include "conv_ph_ps.h" 5 | #endif 6 | 7 | #ifndef MM_USE_FMA 8 | #define _mm_fmadd_ps(a,b,c) _mm_add_ps(c,_mm_mul_ps(a,b)) 9 | #define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(c,_mm256_mul_ps(a,b)) 10 | #endif 11 | 12 | // ------------------------------------------------------------------ 13 | // SSE3/AVX horizontal sum 14 | // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction 15 | // ------------------------------------------------------------------ 16 | 17 | #if 1 18 | 19 | static __inline float hsum_ps_sse(__m128 v) 20 | { 21 | __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 22 | __m128 sums = _mm_add_ps(v, shuf); 23 | shuf = _mm_movehl_ps(shuf, sums); // high half -> low half 24 | sums = _mm_add_ss(sums, shuf); 25 | return _mm_cvtss_f32(sums); 26 | } 27 | 28 | static __inline float hsum_ps_avx1(__m256 v) 29 | { 30 | __m128 vlow = _mm256_castps256_ps128(v); 31 | __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 32 | vlow = _mm_add_ps(vlow, vhigh); // add the low 128 33 | return hsum_ps_sse(vlow); // and inline the sse3 version, which is optimal for AVX 34 | } 35 | 36 | #else 37 | 38 | // FPU. 39 | // note: can be faster than AVX/SSE versions (compiler optimized ?). 
40 | 41 | static __inline float hsum_ps_sse(__m128 v) 42 | { 43 | float *sum_4 = (float *)&v; 44 | return sum_4[0] + sum_4[1] + sum_4[2] + sum_4[3]; 45 | } 46 | 47 | static __inline float hsum_ps_avx1(__m256 v) 48 | { 49 | float *sum_8 = (float *)&v; 50 | return sum_8[0] + sum_8[1] + sum_8[2] + sum_8[3] + sum_8[4] + sum_8[5] + sum_8[6] + sum_8[7]; 51 | } 52 | 53 | #endif 54 | 55 | #define hsum_ps_sse_2x(a,b) hsum_ps_sse(_mm_add_ps(a,b)) 56 | #define hsum_ps_sse_4x(a,b,c,d) hsum_ps_sse(_mm_add_ps(_mm_add_ps(a,b),_mm_add_ps(c,d))) 57 | 58 | #define hsum_ps_avx_2x(a,b) hsum_ps_avx1(_mm256_add_ps(a,b)) 59 | #define hsum_ps_avx_4x(a,b,c,d) hsum_ps_avx1(_mm256_add_ps(_mm256_add_ps(a,b),_mm256_add_ps(c,d))) 60 | 61 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_llama2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama2.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/llama2/llama2-7b-chat-hf/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/llama2/llama2-7b-chat-hf/config.json 7 | rope_theta undefined, expect rotary_emb.inv_freq contained in .safetensors 8 | torch float type: fp16 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 11 | load: E:/llama2/llama2-7b-chat-hf/model-00001-of-00002.safetensors 12 | load: E:/llama2/llama2-7b-chat-hf/model-00002-of-00002.safetensors 13 | sampler config: 14 | temperature : 0.90 15 | topp : 0.80 16 | topk : 40 17 | topp_minp : 0.05 18 | topp_eos : true 19 | repeat_penalty : 0.05 20 | repeat_penalty_n : 50 21 | eos_amp : 0.50 22 | eos_amp_n : 150 23 | rand seed : 1234 24 | Generate: max 4096 tokens.. 25 | - Press 'esc' key to break generation. 
26 | The explanation for the existence of seasons is due to the tilt of the Earth's a 27 | xis and its orbit around the sun. 28 | 29 | The Earth's axis is tilted at an angle of approximately 23.5 degrees relative to 30 | the plane of its orbit around the sun. This means that, as the Earth orbits the 31 | sun, different parts of the planet are tilted towards or away from the sun, res 32 | ulting in changes in the amount of sunlight that reaches the Earth's surface. 33 | 34 | During the summer months in the Northern Hemisphere, the Earth is tilted towards 35 | the sun, resulting in longer days and more direct sunlight reaching the surface 36 | . This leads to warmer temperatures and longer days. 37 | 38 | In contrast, during the winter months in the Northern Hemisphere, the Earth is t 39 | ilted away from the sun, resulting in shorter days and less direct sunlight reac 40 | hing the surface. This leads to colder temperatures and shorter days. 41 | 42 | The same process occurs in the Southern Hemisphere, but with the opposite season 43 | s. When it is summer in the Northern Hemisphere, it is winter in the Southern He 44 | misphere, and vice versa. 45 | total time: 48.91s for 249 tokens, tok/s: 5.09 46 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/model/tr_opt_inc.c: -------------------------------------------------------------------------------- 1 | // this file must be included in transformer.c if USE_THRD_BATCH defined. 
2 | // batch some tread work, allow to gain about 2 to 6 % speed 3 | 4 | #ifdef INC_THRD_BATCH 5 | 6 | // define qkv for self attention 7 | static _inline void opt_compute_qkv(float *q, float *k, float *v, const float *xb, const struct transformer_weights_t *w, int layer_id, mm_proc_t matmul_lw) 8 | { 9 | int n_thrd = numa_map.n_threads; 10 | int i; 11 | CHECK(n_thrd <= w->wk.wy); 12 | 13 | #pragma omp parallel for 14 | for (i=0; iwk.dy; 19 | int dy = WD_GET_DY(y, w->wk.dy, w->wk.wy); 20 | size_t ofs; 21 | 22 | lp = &w->wk.lp[i]; 23 | ofs = (size_t)layer_id * lp->sz_l; // same for k and v (same wy) 24 | _p = (const char *)lp->p + ofs; 25 | matmul_lw(k + y, xb, _p, w->wk.wx, dy); 26 | 27 | lp = &w->wv.lp[i]; 28 | _p = (const char *)lp->p + ofs; 29 | matmul_lw(v + y, xb, _p, w->wv.wx, dy); 30 | 31 | if (q) 32 | { 33 | y = i*w->wq.dy; 34 | dy = WD_GET_DY(y, w->wq.dy, w->wq.wy); 35 | lp = &w->wq.lp[i]; 36 | _p = (const char *)lp->p + (size_t)layer_id * lp->sz_l; 37 | matmul_lw(q + y, xb, _p, w->wq.wx, dy); 38 | } 39 | } 40 | } 41 | 42 | // in xb, work hb2, out hb 43 | static _inline void opt_compute_w1_w3_swiglu(float *hb, float *hb2, const float *xb, const struct transformer_weights_t *w, int layer_id, mm_proc_t matmul_lw) 44 | { 45 | int n_thrd = numa_map.n_threads; 46 | int i; 47 | CHECK(n_thrd <= w->w1.wy); 48 | 49 | #pragma omp parallel for 50 | for (i=0; iw1.dy; 55 | int dy = WD_GET_DY(y, w->w1.dy, w->w1.wy); 56 | int wx = w->w1.wx; 57 | int x, x1; 58 | size_t ofs; 59 | 60 | lp = &w->w1.lp[i]; 61 | ofs = (size_t)layer_id * lp->sz_l; // same for w1/w3 (same wy) 62 | _p = (const char *)lp->p + ofs; 63 | matmul_lw(hb + y, xb, _p, wx, dy); 64 | 65 | lp = &w->w3.lp[i]; 66 | _p = (const char *)lp->p + ofs; 67 | matmul_lw(hb2 + y, xb, _p, wx, dy); 68 | 69 | // swiglu 70 | x1 = y+dy; 71 | for (x=y; x not coded because this conversion in used only in matmul_f32_f12_procs / matmul_f32_f8_procs 31 | 32 | // ----------------------------------- 33 | // vector to matrix 
multiply functions 34 | 35 | // float32 * float32 => float32 36 | typedef void (* matmul_f32_f32_t)(float *res, const float *vec, const float *mat, int len_vec, int y_mat); 37 | 38 | // float32 * float16 => float32 39 | typedef void (* matmul_f32_f16_t)(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat); 40 | 41 | // float32 * bfloat16 => float32 42 | typedef void (* matmul_f32_bf16_t)(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat); 43 | 44 | // float32 * sfloat16 => float32 45 | typedef void (* matmul_f32_sf16_t)(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat); 46 | 47 | // float32 * float12 => float32 48 | typedef void (* matmul_f32_f12_t)(float *res, const float *vec, const f12_t *mat, int len_vec, int y_mat); 49 | 50 | // float32 * float8 => float32 51 | typedef void (* matmul_f32_f8_t)(float *res, const float *vec, const f8_t *mat, int len_vec, int y_mat); 52 | 53 | // list of functions 54 | struct matmul_procs_t 55 | { 56 | // convert 57 | cvt_f16_to_f32_t cvt_f16_to_f32; 58 | cvt_bf16_to_f32_t cvt_bf16_to_f32; 59 | cvt_sf16_to_f32_t cvt_sf16_to_f32; 60 | cvt_f12_to_f32_t cvt_f12_to_f32; 61 | 62 | // matmul 63 | matmul_f32_f32_t matmul_f32_f32; 64 | matmul_f32_f16_t matmul_f32_f16; 65 | matmul_f32_bf16_t matmul_f32_bf16; 66 | matmul_f32_sf16_t matmul_f32_sf16; 67 | matmul_f32_f12_t matmul_f32_f12; 68 | matmul_f32_f8_t matmul_f32_f8; 69 | 70 | // infos 71 | enum e_simd_typ simd_set; // initialized mode 72 | int cpu_f16c; // 1: f16c support 73 | }; 74 | 75 | // interface 76 | extern struct matmul_procs_t matmul_procs; 77 | 78 | // generic data types conversions 79 | void cvt_w_data(void *d, enum e_w_type d_type, const void *s, enum e_w_type s_type, size_t ne); 80 | 81 | // init 82 | void matmul_init(enum e_simd_typ simd_typ); 83 | 84 | // free some memory 85 | void matmul_exit(void); 86 | -------------------------------------------------------------------------------- 
/src/model/tokenizer.h: -------------------------------------------------------------------------------- 1 | // strings dictionary 2 | struct str_dic_t 3 | { 4 | char *buff; // all strings 0 ended concatenated 5 | int sz_alloc; // buff alloc size in bytes 6 | int wr_ofs; // write offset in buff 7 | int n_strings; // count of strings in buff 8 | }; 9 | 10 | // token 11 | struct tok_index_t 12 | { 13 | const char *str; // to string in tokenizer_t dic_tokens 14 | int tok_id; // origin tokenizer index (non sorted list) 15 | int id_to_sort; // origin index to sorted position (tok_id to str convert) 16 | }; 17 | 18 | // merge tokens id 19 | struct merge_id_t 20 | { 21 | int tok_id_l; // left token id 22 | int tok_id_r; // right token id 23 | int tok_id_m; // merged token id 24 | int merge_id; // id in merge list 25 | }; 26 | 27 | // token list element used by tokenizer encode 28 | struct m_tok_t 29 | { 30 | int score; // merge score with right token 31 | int tok_id; // token id 32 | int tok_id_m; // token id if merge with right token 33 | }; 34 | 35 | // merge tokens id list 36 | struct mt_list_t 37 | { 38 | struct m_tok_t *mt; 39 | int n_list; 40 | int n_alloc; 41 | }; 42 | 43 | // BPE tokenizer data 44 | struct tokenizer_t 45 | { 46 | bool mode_ll3; // llama3 tokenizer mode 47 | 48 | struct str_dic_t dic_tokens; // token string list 49 | struct tok_index_t *tok_index; // token id list 50 | int tok_index_list_size; 51 | 52 | // merge list 53 | struct 54 | { 55 | struct merge_id_t *id_list; // merge tokens id 56 | int list_size; 57 | int n_alloc; 58 | } merge; 59 | 60 | int id_special_base; // start index of special tokens not in model.vocab list 61 | int id_special_last; // last special tokens index 62 | int id_special_count; // count of special tokens 63 | 64 | // byte fallback + strips leading whitespace specific (see PR #89), active if not mode_ll3 65 | int token_id_bos_ws; 66 | int token_id_0x0; // <0x00> byte fallback 67 | int token_id_0xff; // <0xff> byte fallback
68 | 69 | // tokenizer_encode token list result 70 | struct mt_list_t mt_list; 71 | }; 72 | 73 | // find a token id from utf8 string, return -1 if not found 74 | int tokenizer_find_token_id(const char *str); 75 | 76 | // find a special token from string and check is special token 77 | int tokenizer_find_sp_token_id(const char *str); 78 | 79 | // return token string from token index 80 | const char *tokenizer_get_token_str(int token_id); 81 | 82 | // encode text, define mt_list token list 83 | void tokenizer_encode(const char *text); 84 | 85 | // return decoded token string. 86 | const char *tokenizer_decode(int token_id); 87 | 88 | // decode and print 89 | void tokenizer_decode_print(int token_id, bool disp_raw); 90 | 91 | // load and init tokenizer from .json file 92 | void build_tokenizer(void); 93 | 94 | // free allocated mem 95 | void free_tokenizer(void); 96 | -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_llama31.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_llama3.1.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: D:/llama3.1_st/8b-instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: D:/llama3.1_st/8b-instruct/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: D:/llama3.1_st/8b-instruct/model-00001-of-00004.safetensors 12 | load: D:/llama3.1_st/8b-instruct/model-00002-of-00004.safetensors 13 | load: D:/llama3.1_st/8b-instruct/model-00003-of-00004.safetensors 14 | load: D:/llama3.1_st/8b-instruct/model-00004-of-00004.safetensors 15 | sampler config: 16 | temperature : 0.90 17 | topp : 0.85 18 | topk : 30 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.05 22 | repeat_penalty_n : 100 23 | eos_amp : 0.50 24 | eos_amp_n : 250 25 | rand seed : 1234 26 | Generate: max 131072 tokens.. 27 | - Press 'esc' key to break generation. 28 | The explanation for the existence of seasons is rooted in the Earth's tilt and o 29 | rbit around the Sun. The tilt of the Earth is approximately 23.5 degrees, which 30 | causes the amount of sunlight that reaches the planet's surface to vary througho 31 | ut the year. During the summer months, the Northern Hemisphere is tilted towards 32 | the Sun, resulting in longer days and more direct sunlight. Conversely, during 33 | the winter months, the Northern Hemisphere is tilted away from the Sun, resultin 34 | g in shorter days and less direct sunlight. 35 | This tilt is the primary cause of the changing seasons, with the Earth's orbit a 36 | round the Sun also playing a role. The Earth's orbit is elliptical, meaning that 37 | its distance from the Sun varies throughout the year. However, the difference i 38 | n distance has a minimal impact on the seasons, as the tilt of the Earth is the 39 | dominant factor. 40 | The four seasons that occur on Earth are: 41 | 1. **Spring**: Typically begins around March 20/21 in the Northern Hemisphere an 42 | d September 22/23 in the Southern Hemisphere. During this season, the weather st 43 | arts to warm up, and days get longer. 44 | 2. **Summer**: Begins around June 20/21 in the Northern Hemisphere and December 45 | 21/22 in the Southern Hemisphere. 
This season is characterized by long days, war 46 | m temperatures, and often dry conditions. 47 | 3. **Autumn** (or **Fall**): Begins around September 22/23 in the Northern Hemis 48 | phere and March 20/21 in the Southern Hemisphere. As the days shorten, temperatu 49 | res cool, and leaves on trees change color before falling. 50 | 4. **Winter**: Begins around December 21/22 in the Northern Hemisphere and June 51 | 20/21 in the Southern Hemisphere. This season is marked by shorter days, colder 52 | temperatures, and often snow and ice. 53 | 54 | The seasonal variations have a significant impact on the environment, ecosystems 55 | , and human societies. Understanding the causes of the seasons is essential for 56 | predicting weather patterns, managing resources, and planning activities. 57 | total time: 72.19s for 398 tokens, tok/s: 5.51 58 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/1_node/gen_f12/gen_qwen2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_qwen2.5.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/qwen/qwen2.5-7B-Instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/qwen/qwen2.5-7B-Instruct/config.json 7 | torch float type: bf16 8 | model weights converted to float12. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s). 
11 | load: E:/qwen/qwen2.5-7B-Instruct/model-00001-of-00004.safetensors 12 | load: E:/qwen/qwen2.5-7B-Instruct/model-00002-of-00004.safetensors 13 | load: E:/qwen/qwen2.5-7B-Instruct/model-00003-of-00004.safetensors 14 | load: E:/qwen/qwen2.5-7B-Instruct/model-00004-of-00004.safetensors 15 | warning: tokenizer/transformer vocab_size missmatch (151665/152064) 16 | sampler config: 17 | temperature : 0.60 18 | topp : 0.65 19 | topk : 25 20 | topp_minp : 0.05 21 | topp_eos : true 22 | repeat_penalty : 0.00 23 | repeat_penalty_n : 0 24 | eos_amp : 0.05 25 | eos_amp_n : 300 26 | rand seed : 1234 27 | Generate: max 32768 tokens.. 28 | - Press 'esc' key to break generation. 29 | The explanation for the existence of seasons is that the Earth's axis is tilted 30 | at an angle of 23.5 degrees relative to its orbital plane. This tilt causes diff 31 | erent parts of the Earth to receive varying amounts of sunlight throughout the y 32 | ear, leading to the changing seasons. However, the Earth's axis is not fixed in 33 | space, but rather it precesses, meaning it traces out a circle in the sky over a 34 | period of about 26,000 years. This precession affects the direction in which th 35 | e Earth's axis points, and consequently, the position of the solstices and equin 36 | oxes relative to the stars. 37 | 38 | Given that the Earth's axis is tilted at 23.5 degrees, and it precesses over a p 39 | eriod of 26,000 years, calculate the average rate of precession in degrees per y 40 | ear. Additionally, determine the change in the position of the solstices relativ 41 | e to the stars over a period of 13,000 years. To determine the average rate of p 42 | recession in degrees per year, we start with the total precession period and the 43 | total angle of precession. The Earth's axis precesses in a circle over a period 44 | of 26,000 years, and the angle of precession is 360 degrees (since it completes 45 | one full circle). 
46 | 47 | The average rate of precession in degrees per year is given by: 48 | \[ 49 | \text{Average rate of precession} = \frac{360 \text{ degrees}}{26,000 \text{ yea 50 | rs}} = \frac{360}{26,000} \approx 0.013846 \text{ degrees per year} 51 | \] 52 | 53 | Next, we need to determine the change in the position of the solstices relative 54 | to the stars over a period of 13,000 years. Since the Earth's axis precesses at 55 | a rate of approximately 0.013846 degrees per year, the change in the position of 56 | the solstices over 13,000 years is: 57 | \[ 58 | \text{Change in position} = 0.013846 \text{ degrees per year} \times 13,000 \tex 59 | t{ years} = 180.00 \text{ degrees} 60 | \] 61 | 62 | Therefore, the change in the position of the solstices relative to the stars ove 63 | r a period of 13,000 years is: 64 | \[ 65 | \boxed{180 \text{ degrees}} 66 | \] 67 | total time: 93.12s for 543 tokens, tok/s: 5.83 68 | Press any key to continue . . . -------------------------------------------------------------------------------- /src/utils/l_util.h: -------------------------------------------------------------------------------- 1 | // generic functions for llama api 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) 9 | #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) 10 | 11 | #define MBYTE (1024*1024) // 1 megabyte 12 | #define GBYTE (1024*1024*1024) // 1 gigabyte 13 | 14 | #define __no_return __declspec(noreturn) 15 | 16 | // ------------------------------------ 17 | // debug 18 | 19 | #if defined(_DEBUG) || defined(_CHECK) 20 | void debug_break(void); 21 | #define CHECK(a) ((a) ? 
(void)0 : debug_break()) 22 | #else 23 | #define CHECK(a) 24 | #endif 25 | 26 | // ------------------------------------ 27 | // floats checks 28 | bool check_no_nan_f32(const float *buff, size_t ne); 29 | 30 | // ------------------------------------ 31 | // softmax 32 | 33 | void softmax(float *x, int size); 34 | 35 | // ------------------------------------ 36 | // information messages 37 | 38 | void msg_info(const char *fmt, ...); 39 | 40 | // print some space (align text usage) 41 | void msg_spc(int n_spc); 42 | 43 | // ------------------------------------ 44 | // errors 45 | 46 | extern jmp_buf error_jmp; 47 | 48 | #define APP_ERROR() setjmp(error_jmp) 49 | 50 | // print error message and jump to error code 51 | void __no_return msg_error(const char *fmt, ...); 52 | 53 | // print assert error message and exit 54 | void __no_return assert_exit(const char *fmt, ...); 55 | 56 | // assert + position + exit 57 | #define _ASSERT(x) if (!(x)) assert_exit("%s:%s:%d: %s\n", __FILE__, __FUNCTION__, __LINE__, #x) 58 | 59 | // check range of an int32 value 60 | void check_range_i(int a, const char *name, int min, int max); 61 | 62 | // ------------------------------------ 63 | // rng 64 | 65 | void rand_seed(int seed); 66 | int rand_n(void); 67 | float rand1(void); 68 | float rand1s(void); 69 | 70 | // ------------------------------------ 71 | // time 72 | 73 | int time_in_ms(void); 74 | 75 | // ------------------------------------ 76 | // files, manage big files and trap errors 77 | 78 | typedef struct 79 | { 80 | void *handle; 81 | const char *name; 82 | int64_t size; 83 | // user datas 84 | int64_t seek_ofs; // offset to add to f_seek 85 | } file_t; 86 | 87 | #define f_SEEK_CUR 1 88 | #define f_SEEK_END 2 89 | #define f_SEEK_SET 0 90 | 91 | void f_seek(file_t *h, int64_t ofs, int origin); 92 | int64_t f_tell(file_t *h); 93 | 94 | void f_open(file_t *h, const char *name, const char *mode); 95 | void f_close(file_t *h); 96 | void f_read(void *p, int64_t size, file_t *h); 
97 | void f_write(void *p, int64_t size, file_t *h); 98 | 99 | // ------------------------------------ 100 | // wait return pressed and exit 101 | 102 | void wait_return_exit(void); 103 | 104 | // ------------------------------------ 105 | // display progress bar for long time operations 106 | 107 | void progress_bar_init(bool new_line, int64_t max_value); 108 | void progress_bar_update(int64_t value); 109 | void progress_bar_done(void); 110 | 111 | // adjust range of float x value in [x_min..x_max] 112 | void adjust_range_f32(float *x, const char *x_name, float x_min, float x_max); 113 | 114 | // adjust range of int x value in [x_min..x_max] 115 | void adjust_range_int(int *x, const char *x_name, int x_min, int x_max); 116 | -------------------------------------------------------------------------------- /src/model/kv_cache.c: -------------------------------------------------------------------------------- 1 | // kv cache: is part of transformer.c but moved in separate file for clarity. 2 | // this code uses a method to ensure kv cache is never full. (experimental.. is that good ?) 3 | // the method consists in 'forgetting' the oldest tokens. 4 | // in chat mode: 5 | // - series of one user entry + one llm reply are deleted after system prompt 6 | // in generate mode: 7 | // - only delete some first tokens. 8 | // the hole produced after system prompt is removed and rope for kv data following hole is updated as if tokens follow prompt without hole. 9 | // todo: difficult to test effect as it requires context almost full to operate. 10 | 11 | #ifdef PACK_KV_CACHE 12 | 13 | #include "l_util.h" 14 | #include "model.h" 15 | 16 | // 'forget' some tokens in kv cache to reduce context size.
17 | static void reduce_kv_cache(int min_tokens_delete) 18 | { 19 | struct transformer_t *t = &model.transformer; 20 | const struct transformer_config_t *p = &t->config; 21 | struct transformer_runstate_t *s = &t->state; 22 | 23 | int n_ctx = s->cache.n_tokens; // current tokens count in context 24 | int min_del = n_ctx/20; // min tokens to delete (5% of context) 25 | int i0, i, n_del; 26 | 27 | if (model.config.run_mode == 0) 28 | { 29 | // generate mode, very unlikely to happen (no eot produced before context full) 30 | // done to ensure no cache overflow. 31 | i0 = 0; 32 | i = min_del; 33 | } 34 | else // chat mode 35 | { 36 | if (min_tokens_delete < min_del) 37 | min_tokens_delete = min_del; 38 | 39 | i0 = t->state.cache.n_tokens_sys; // keep sys prompt if defined 40 | for (i=i0; istate.cache.tokens[i].sampled) 45 | break; 46 | 47 | // pass one llm reply 48 | for (; istate.cache.tokens[i].sampled) 50 | break; 51 | 52 | // test if enough deleted 53 | if ((i - i0) >= min_tokens_delete) 54 | break; 55 | } 56 | } 57 | 58 | // count of deleted tokens in cache 59 | n_del = i - i0; 60 | 61 | // update user info 62 | t->state.cache.n_tokens_del += n_del; 63 | 64 | #if 0 65 | // debug: display deleted token list 66 | { 67 | int j; 68 | msg_info("\n------\n kv delete %d tokens:\n", n_del); 69 | for (j=i0; jcache.tokens[j].token_id, true); 72 | msg_info(","); 73 | } 74 | msg_info("\n------\n"); 75 | } 76 | #endif 77 | 78 | // define < 0 rope rotation = - num of deleted tokens 79 | set_RoPE_pos(s->rope_sin_cos, -n_del, s->rope_freq, p->head_size/2); 80 | 81 | // compact and update kv cache rope 82 | s->cache.n_tokens = i0; 83 | s->cache.n_tokens_samp = 0; 84 | 85 | for (; icache.n_tokens++; 88 | 89 | // remove kv cache hole 90 | for (l=0; ln_layers; l++) 91 | { 92 | size_t i_ofs = ((size_t)l * n_ctx + i ) * p->kv_dim; 93 | size_t p_ofs = ((size_t)l * n_ctx + pos) * p->kv_dim; 94 | RoPE(&s->k_cache[i_ofs], &s->v_cache[i_ofs], s->rope_sin_cos, p->head_size, p->kv_dim, 
p->kv_dim); 95 | memcpy(&s->k_cache[p_ofs], &s->k_cache[i_ofs], p->kv_dim * sizeof(float)); 96 | memcpy(&s->v_cache[p_ofs], &s->v_cache[i_ofs], p->kv_dim * sizeof(float)); 97 | } 98 | 99 | // compact token list 100 | s->cache.tokens[pos] = s->cache.tokens[i]; 101 | if (s->cache.tokens[pos].sampled) 102 | s->cache.n_tokens_samp++; 103 | else 104 | s->cache.n_tokens_samp = 0; 105 | } 106 | } 107 | 108 | // reserve tokens in kv cache for llm generation. 109 | // return count of deleted tokens (0 if enough space was already available) 110 | int reserve_kv_cache(int min_token_reserve) 111 | { 112 | int token_prev = model.transformer.state.cache.n_tokens; 113 | int token_left = model.transformer.config.seq_len - token_prev; // free slots left in context 114 | if (token_left < min_token_reserve) // not enough room: 'forget' oldest tokens 115 | { 116 | reduce_kv_cache(min_token_reserve - token_left); 117 | return token_prev - model.transformer.state.cache.n_tokens; // return num of deleted tokens 118 | } 119 | return 0; 120 | } 121 | 122 | #endif // PACK_KV_CACHE -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_mixtral_f8.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_mixtral.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/config.json 7 | torch float type: bf16 8 | model weights converted to float8. 9 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 10 | processor(s) core(s) used: 12 in 1 node(s).
11 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00001-of-00019.safetensors 12 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00002-of-00019.safetensors 13 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00003-of-00019.safetensors 14 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00004-of-00019.safetensors 15 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00005-of-00019.safetensors 16 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00006-of-00019.safetensors 17 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00007-of-00019.safetensors 18 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00008-of-00019.safetensors 19 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00009-of-00019.safetensors 20 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00010-of-00019.safetensors 21 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00011-of-00019.safetensors 22 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00012-of-00019.safetensors 23 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00013-of-00019.safetensors 24 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00014-of-00019.safetensors 25 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00015-of-00019.safetensors 26 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00016-of-00019.safetensors 27 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00017-of-00019.safetensors 28 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00018-of-00019.safetensors 29 | load: E:/mixtral/Mixtral-8x7B-Instruct-v0.1/model-00019-of-00019.safetensors 30 | sampler config: 31 | temperature : 0.70 32 | topp : 0.80 33 | topk : 40 34 | topp_minp : 0.05 35 | topp_eos : true 36 | repeat_penalty : 0.05 37 | repeat_penalty_n : 50 38 | eos_amp : 0.00 39 | eos_amp_n : 150 40 | rand seed : 1234 41 | Generate: max 32768 tokens.. 42 | - Press 'esc' key to break generation. 43 | The explanation for the existence of seasons is not as simple as you might think 44 | . 
The tilt of the Earth's axis and its orbit around the sun combine to create th 45 | e seasons, but the tilt itself is the most important factor. The Earth's axis is 46 | tilted at an angle of 23.5 degrees from the vertical, and this tilt is what cau 47 | ses the sun's rays to hit different parts of the Earth at different angles durin 48 | g different times of the year. 49 | 50 | During the summer, the Earth's tilt causes the sun's rays to hit the Earth at a 51 | more direct angle, resulting in more intense heat and longer days. In the winter 52 | , the tilt causes the sun's rays to hit the Earth at a more oblique angle, resul 53 | ting in less intense heat and shorter days. The tilt also causes the sun to appe 54 | ar higher in the sky during the summer and lower in the sky during the winter. 55 | 56 | The Earth's orbit around the sun also plays a role in the creation of seasons, b 57 | ut it is less important than the tilt of the Earth's axis. The Earth's orbit is 58 | not a perfect circle, but rather an ellipse. This means that the Earth is closer 59 | to the sun at certain times of the year and farther away at other times. Howeve 60 | r, the difference in distance between the Earth and the sun is not enough to sig 61 | nificantly affect the intensity of the sun's rays or the length of the days. 62 | 63 | In summary, the tilt of the Earth's axis is the primary factor responsible for t 64 | he creation of seasons. The Earth's orbit around the sun also plays a role, but 65 | it is less important than the tilt. The tilt causes the sun's rays to hit differ 66 | ent parts of the Earth at different angles, resulting in the varying temperature 67 | s and day lengths that we associate with the seasons. 68 | total time: 79.28s for 384 tokens, tok/s: 4.84 69 | Press any key to continue . . . 
-------------------------------------------------------------------------------- /src/utils/mem_alloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "l_util.h" 3 | #include "mem_alloc.h" 4 | 5 | #define ALLOC_SZ_ALIGN (256/8) // align for AVX 6 | 7 | // ------------------------------------ 8 | // memory allocation + check 9 | 10 | #if !defined(_DEBUG) && !defined(CHECK_ALLOC) 11 | 12 | void *malloc_check(size_t size) 13 | { 14 | void *p = _aligned_malloc(size, ALLOC_SZ_ALIGN); 15 | if (!p) 16 | msg_error("malloc failed to alloc %d bytes\n", size); 17 | return p; 18 | } 19 | 20 | void *calloc_check(size_t size) 21 | { 22 | void *p = _aligned_malloc(size, ALLOC_SZ_ALIGN); 23 | if (!p) 24 | msg_error("calloc failed to alloc %d bytes\n", size); 25 | memset(p, 0, size); 26 | return p; 27 | } 28 | 29 | void *realloc_check(void *ptr, size_t size) 30 | { 31 | void *p = _aligned_realloc(ptr, size, ALLOC_SZ_ALIGN); 32 | if (!p) 33 | msg_error("realloc failed to alloc %d bytes\n", size); 34 | return p; 35 | } 36 | 37 | void free_check(void *ptr) 38 | { 39 | if (ptr) 40 | _aligned_free(ptr); 41 | } 42 | 43 | void dbg_print_alloc(void) 44 | { 45 | // not checked in release 46 | } 47 | 48 | #else 49 | 50 | // debug malloc + allocated size infos 51 | 52 | // stats 53 | struct 54 | { 55 | size_t size_alloc_sum; // current allocated size 56 | size_t size_alloc_sum_max; // max reached allocated size 57 | size_t size_block_max; // max allocated block size 58 | int n_header; // current header in use 59 | int n_malloc; // sum count 60 | int n_realloc; // sum count 61 | } a_inf = { 0 }; 62 | 63 | // alloc header, use size that keep alignment for sse 64 | typedef union 65 | { 66 | size_t sz; 67 | char mem[ALLOC_SZ_ALIGN]; 68 | } a_hdr; 69 | 70 | // return aligned address 71 | static void *mem_align(a_hdr *h, size_t alloc_sz) 72 | { 73 | void *p; 74 | if (alloc_sz > a_inf.size_block_max) 75 | a_inf.size_block_max = alloc_sz; 
76 | 77 | if (a_inf.size_alloc_sum > a_inf.size_alloc_sum_max) 78 | a_inf.size_alloc_sum_max = a_inf.size_alloc_sum; 79 | 80 | h->sz = alloc_sz; // record block size in the hidden header 81 | p = h + 1; // user pointer starts just past the header 82 | CHECK(((size_t)p & (ALLOC_SZ_ALIGN-1)) == 0); 83 | return p; 84 | } 85 | 86 | void *malloc_check(size_t size) 87 | { 88 | a_hdr *h = (a_hdr *)_aligned_malloc(size + sizeof(a_hdr), ALLOC_SZ_ALIGN); 89 | if (!h) 90 | msg_error("malloc failed to alloc %d bytes\n", size); // NOTE(review): %d with a size_t argument; %zu would be correct 91 | 92 | memset(h, 0, sizeof(a_hdr)); // 0 unused bytes 93 | a_inf.size_alloc_sum += size; 94 | a_inf.n_header++; 95 | a_inf.n_malloc++; 96 | return mem_align(h, size); 97 | } 98 | 99 | void *calloc_check(size_t size) 100 | { 101 | void *p = malloc_check(size); // malloc_check aborts via msg_error on failure 102 | memset(p, 0, size); 103 | return p; 104 | } 105 | 106 | void *realloc_check(void *ptr, size_t size) 107 | { 108 | if (ptr) 109 | { 110 | a_hdr *h = (a_hdr *)ptr - 1; 111 | a_inf.size_alloc_sum -= h->sz; 112 | CHECK(a_inf.size_alloc_sum >= 0); // NOTE(review): size_alloc_sum is size_t (unsigned) so this check can never fail; underflow wraps instead 113 | h = (a_hdr *)_aligned_realloc(h, size + sizeof(a_hdr), ALLOC_SZ_ALIGN); 114 | if (!h) 115 | msg_error("realloc failed to alloc %d bytes\n", size); 116 | 117 | a_inf.size_alloc_sum += size; 118 | a_inf.n_realloc++; 119 | return mem_align(h, size); 120 | } 121 | return malloc_check(size); // realloc(NULL, size) behaves as malloc 122 | } 123 | 124 | void free_check(void *ptr) 125 | { 126 | if (ptr) 127 | { 128 | a_hdr *h = (a_hdr *)ptr - 1; 129 | a_inf.size_alloc_sum -= h->sz; 130 | a_inf.n_header--; 131 | CHECK(a_inf.size_alloc_sum >= 0); // NOTE(review): always true for unsigned size_t, same issue as in realloc_check 132 | CHECK(a_inf.n_header >= 0); 133 | _aligned_free(h); 134 | } 135 | } 136 | 137 | // print allocation statistics (debug / CHECK_ALLOC builds) 138 | void dbg_print_alloc(void) 139 | { 140 | msg_info("INFO mem alloc:\n"); 141 | msg_info(" size_alloc_sum %.6f Mb\n", (double)a_inf.size_alloc_sum / (1024*1024)); 142 | msg_info(" size_block_max; %.6f Mb\n", (double)a_inf.size_block_max / (1024*1024)); // NOTE(review): stray ';' in the label string 143 | msg_info(" n_header %d\n", a_inf.n_header); 144 | msg_info(" n_malloc %d\n", a_inf.n_malloc); 145 | msg_info(" n_realloc %d\n", a_inf.n_realloc); 146 | msg_info(" size_alloc_sum_max
%.6f Mb\n", (double)a_inf.size_alloc_sum_max / (1024*1024)); 147 | if (a_inf.size_alloc_sum || a_inf.n_header) 148 | msg_info(" >some memory is still allocated: %u bytes\n", (int)a_inf.size_alloc_sum); 149 | else 150 | msg_info(" >all memory has been freed.\n"); 151 | } 152 | 153 | #endif 154 | 155 | // alloc string 156 | char *str_alloc(const char *str, int len) 157 | { 158 | char *s; 159 | CHECK(len >= 0); 160 | s = malloc_check(len+1); 161 | memcpy(s, str, len); 162 | s[len] = 0; 163 | return s; 164 | } 165 | -------------------------------------------------------------------------------- /src/model/model.h: -------------------------------------------------------------------------------- 1 | // user application header 2 | #include "transformer.h" 3 | #include "tokenizer.h" 4 | #include "sampler.h" 5 | 6 | // model identifier 7 | enum e_model_id 8 | { 9 | model_id_tinyllama = 0, // "tinyllama", 10 | model_id_llama1, // "llama1", 11 | model_id_llama2, // "llama2", 12 | model_id_code_llama, // "codellama", 13 | model_id_llama3, // "llama3", 14 | model_id_llama31, // "llama31", 15 | model_id_mistral, // "mistral", 16 | model_id_mathstral, // "mathstral", 17 | model_id_zephyr, // "zephyr", 18 | model_id_mixtral, // "mixtral", 19 | model_id_vigogne2, // "vigogne2", 20 | model_id_qwen2, // "qwen2", 21 | model_id_count, // models count 22 | }; 23 | 24 | extern const char *model_id_names[model_id_count]; 25 | 26 | // application run mode 27 | enum e_run_mode 28 | { 29 | run_mode_generate = 0, 30 | run_mode_chat, 31 | }; 32 | 33 | // chat mode config 34 | struct chat_cfg_t 35 | { 36 | bool chat_use_colors; // use colors for user/assistant text 37 | 38 | // forward tokens display options 39 | int fwd_disp_mode; // 0: display nothing, 1: tokens list 40 | 41 | // method used to generate the chat prompt format 42 | int chat_prompt_mode; 43 | 44 | // prompt names displayed for assistant and user 45 | char *chat_assistant_name; 46 | char *chat_user_name; 47 | 48 | // mode 0 49 
| char *cm0_sys_prompt; 50 | char *cm0_user_prompt; 51 | 52 | // mode 1 53 | char *cm1_sys_template; 54 | char *cm1_user_first_template; 55 | char *cm1_user_template; 56 | char *cm1_end_template; 57 | char *cm1_sys_prompt; 58 | char *cm1_user_prompt; 59 | 60 | // mode 2 61 | char *cm2_sys_template; 62 | char *cm2_user_template; 63 | char *cm2_user_name_sw; // swith user/assistant string in generate mode 64 | char *cm2_sys_prompt; 65 | char *cm2_user_prompt; 66 | }; 67 | 68 | // run configuration defined in json 69 | struct run_conf_t 70 | { 71 | // model identifier 72 | char *model_ident; // define model type for model specificities 73 | 74 | // model load 75 | struct 76 | { 77 | int model_num_safetensors; // count of .safetensors files in model 78 | char *model_path; // path to model, ex "C:/llama2/llama2-7b-chat-hf" 79 | char *tokenizer_name; // tokenizer file name (ex tokenizer.json) 80 | } load; 81 | 82 | // set or override rope freq 83 | float rope_set; // set/change rope inv freq value, ignored if 0 84 | 85 | // sampler config defined in sampler struct 86 | 87 | // load parameters 88 | bool cvt_sf16; // convert model to sfloat16 at load 89 | bool cvt_f12; // convert model to float12 at load 90 | bool cvt_f8; // convert model to float8 at load 91 | 92 | // hardware parameters 93 | int num_procs; // num procs used for threads 94 | int numa_nodes; // num numa nodes to init 95 | int simd_mode; // -1: best auto, 0:off(fpu) 1:sse 2:avx 96 | 97 | // checks 98 | bool test_nan_logits; // test for NAN at sampling in all logits results 99 | 100 | // run mode 101 | enum e_run_mode run_mode; // 0: generate, 1:chat 102 | int gen_run_steps; // number of steps to run. 
0 = max (model max_seq_len) 103 | char *token_eos_str; // end of string token (assistant reply end) 104 | char *token_eot_str; // end of text token (dialog/generate end) 105 | 106 | // token display option 107 | bool tok_disp_raw; 108 | bool tok_disp_split; // separate each token with ',' 109 | bool tok_disp_prob; // display sampling information 110 | 111 | // generate mode config 112 | char *gen_mode_prompt; // init prompt for generate run_mode 113 | 114 | // chat mode config 115 | struct chat_cfg_t chat; 116 | 117 | // defined using strings 118 | enum e_model_id e_model_id; 119 | int token_eos; // eos token 120 | int token_eot; // eot token 121 | }; 122 | 123 | struct model_t 124 | { 125 | struct run_conf_t config; 126 | struct tokenizer_t tokenizer; 127 | struct transformer_t transformer; 128 | struct sampler_t sampler; 129 | }; 130 | 131 | extern struct model_t model; 132 | 133 | void build_model(const char *conf_file_name); 134 | 135 | void free_model(void); 136 | 137 | // chat loop 138 | void chat(void); 139 | 140 | // generation loop 141 | void generate(void); 142 | -------------------------------------------------------------------------------- /tests/1_node/gen_ref/gen_qwen2.txt: -------------------------------------------------------------------------------- 1 | read file run_json/run_qwen2.5.json 2 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 3 | conv/matmul AVX2 checks done. 4 | load tokenizer: E:/qwen/qwen2.5-7B-Instruct/tokenizer.json 5 | load transformer.. 6 | read model config in: E:/qwen/qwen2.5-7B-Instruct/config.json 7 | torch float type: bf16 8 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 9 | processor(s) core(s) used: 12 in 1 node(s). 
10 | load: E:/qwen/qwen2.5-7B-Instruct/model-00001-of-00004.safetensors 11 | load: E:/qwen/qwen2.5-7B-Instruct/model-00002-of-00004.safetensors 12 | load: E:/qwen/qwen2.5-7B-Instruct/model-00003-of-00004.safetensors 13 | load: E:/qwen/qwen2.5-7B-Instruct/model-00004-of-00004.safetensors 14 | warning: tokenizer/transformer vocab_size missmatch (151665/152064) 15 | sampler config: 16 | temperature : 0.60 17 | topp : 0.65 18 | topk : 25 19 | topp_minp : 0.05 20 | topp_eos : true 21 | repeat_penalty : 0.00 22 | repeat_penalty_n : 0 23 | eos_amp : 0.05 24 | eos_amp_n : 300 25 | rand seed : 1234 26 | Generate: max 32768 tokens.. 27 | - Press 'esc' key to break generation. 28 | The explanation for the existence of seasons is that the Earth's axis is tilted 29 | at an angle of 23.5 degrees relative to its orbital plane. This tilt causes diff 30 | erent parts of the Earth to receive varying amounts of sunlight throughout the y 31 | ear, leading to the changing seasons. However, the Earth's axis is not fixed in 32 | space, but rather it precesses, meaning it traces out a circle in the sky over a 33 | period of about 26,000 years. This precession affects the direction in which th 34 | e Earth's axis points, and consequently, the position of the solstices and equin 35 | oxes relative to the stars. 36 | 37 | Given that the Earth's axis is tilted at 23.5 degrees, and it precesses over a p 38 | eriod of 26,000 years, calculate the average rate of precession in degrees per y 39 | ear. Additionally, determine the change in the position of the solstices relativ 40 | e to the stars over a period of 13,000 years. To determine the average rate of p 41 | recession in degrees per year, we start with the total precession period and the 42 | total angle of precession. The Earth's axis precesses in a circle over a period 43 | of 26,000 years, and the angle of precession is 360 degrees (since it completes 44 | one full circle). 
45 | 46 | The average rate of precession in degrees per year is given by: 47 | \[ 48 | \text{Average rate of precession} = \frac{360 \text{ degrees}}{26,000 \text{ yea 49 | rs}} = \frac{360}{26,000} \approx 0.013846 \text{ degrees per year} 50 | \] 51 | 52 | Next, we need to determine the change in the position of the solstices relative 53 | to the stars over a period of 13,000 years. Since the Earth's axis precesses at 54 | a rate of approximately 0.013846 degrees per year, the change in the position of 55 | the solstices over 13,000 years is: 56 | \[ 57 | \text{Change in position} = 0.013846 \text{ degrees per year} \times 13,000 \tex 58 | t{ years} = 180.00 \text{ degrees} 59 | \] 60 | 61 | Therefore, the change in the position of the solstices relative to the stars ove 62 | r a period of 13,000 years is: 63 | \[ 64 | \boxed{180 \text{ degrees}} 65 | \] 66 | total time: 114.35s for 543 tokens, tok/s: 4.75 67 | Press any key to continue . . . 68 | 69 | 70 | 71 | 72 | read file run_json/run_qwen2.5.json 73 | CPU flags: f16c:1 fma3:1, sse4.2:1 avx:1 avx2:1 74 | conv/matmul AVX2 checks done. 75 | load tokenizer: E:/qwen/qwen2.5-0.5B/tokenizer.json 76 | load transformer.. 77 | read model config in: E:/qwen/qwen2.5-0.5B/config.json 78 | torch float type: bf16 79 | numa node(s): 1, mp node: 0, num logical/physical procs.: 14/14 (HT off) 80 | processor(s) core(s) used: 12 in 1 node(s). 81 | load: E:/qwen/qwen2.5-0.5B/model.safetensors 82 | info: classifier use embed_tokens.weight. 83 | warning: tokenizer/transformer vocab_size missmatch (151665/151936) 84 | sampler config: 85 | temperature : 0.60 86 | topp : 0.65 87 | topk : 25 88 | topp_minp : 0.05 89 | topp_eos : true 90 | repeat_penalty : 0.00 91 | repeat_penalty_n : 0 92 | eos_amp : 0.05 93 | eos_amp_n : 300 94 | rand seed : 1234 95 | Generate: max 32768 tokens.. 96 | - Press 'esc' key to break generation. 
97 | The explanation for the existence of seasons is that the Earth's axis is tilted 98 | at an angle of approximately 23.5 degrees relative to its orbit around the Sun. 99 | This tilt causes the same amount of sunlight to fall on the northern and souther 100 | n hemispheres at different times of the year. This is why the seasons are observ 101 | ed. 102 | total time: 1.22s for 67 tokens, tok/s: 55.05 103 | Press any key to continue . . . -------------------------------------------------------------------------------- /tests/2_nodes/res_2sockets.txt: -------------------------------------------------------------------------------- 1 | LLama2 7B 2 | C:\dev_c\llama_st>llama_stw run_json/run_llama2.json 3 | read model config in: D:/llama2/llama2-7b-chat-hf/config.json 4 | 5 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 6 | 7 | processor(s) used: 24 in 2 node(s). 8 | total time: 28.00s for 249 tokens, tok/s: 8.89 9 | 10 | processor(s) used: 22 in 2 node(s). 11 | total time: 27.49s for 249 tokens, tok/s: 9.06 12 | 13 | processor(s) used: 20 in 2 node(s). 14 | total time: 27.92s for 249 tokens, tok/s: 8.92 15 | 16 | processor(s) used: 18 in 2 node(s). 17 | total time: 28.55s for 249 tokens, tok/s: 8.72 18 | 19 | processor(s) used: 16 in 2 node(s). 20 | total time: 29.76s for 249 tokens, tok/s: 8.37 21 | 22 | 23 | model weights converted to float12. 24 | 25 | processor(s) used: 24 in 2 node(s). 26 | total time: 22.87s for 249 tokens, tok/s: 10.89 27 | 28 | processor(s) used: 22 in 2 node(s). 29 | total time: 24.57s for 249 tokens, tok/s: 10.13 30 | 31 | processor(s) used: 20 in 2 node(s). 32 | total time: 26.27s for 249 tokens, tok/s: 9.48 33 | 34 | processor(s) used: 18 in 2 node(s). 35 | total time: 28.89s for 249 tokens, tok/s: 8.62 36 | 37 | processor(s) used: 16 in 2 node(s). 
38 | total time: 32.35s for 249 tokens, tok/s: 7.70 39 | 40 | 41 | // -------------------------------------- 42 | 43 | read model config in: D:/qwen/qwen2.5-7B-Instruct/config.json 44 | 45 | processor(s) used: 24 in 2 node(s). 46 | total time: 63.40s for 543 tokens, tok/s: 8.56 47 | 48 | processor(s) used: 22 in 2 node(s). 49 | total time: 63.74s for 543 tokens, tok/s: 8.52 50 | 51 | processor(s) used: 20 in 2 node(s). 52 | total time: 64.12s for 543 tokens, tok/s: 8.47 53 | 54 | processor(s) used: 18 in 2 node(s). 55 | total time: 66.64s for 543 tokens, tok/s: 8.15 56 | 57 | processor(s) used: 16 in 2 node(s). 58 | total time: 69.56s for 543 tokens, tok/s: 7.81 59 | 60 | f12 61 | 62 | processor(s) used: 24 in 2 node(s). 63 | total time: 52.88s for 543 tokens, tok/s: 10.27 64 | 65 | processor(s) used: 22 in 2 node(s). 66 | total time: 59.48s for 543 tokens, tok/s: 9.13 67 | 68 | processor(s) used: 20 in 2 node(s). 69 | total time: 65.05s for 543 tokens, tok/s: 8.35 70 | 71 | 72 | 73 | load tokenizer: D:/qwen/qwen2.5-72b-intruct/tokenizer.json 74 | model weights converted to float12. 75 | 76 | processor(s) used: 24 in 2 node(s). 77 | total time: 924.74s for 480 tokens, tok/s: 0.52 78 | 79 | 80 | 81 | 82 | load tokenizer: D:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 83 | torch float type: bf16 84 | 85 | processor(s) core(s) used: 12 in 1 node(s). (HT off) 86 | total time: 174.74s for 282 tokens, tok/s: 1.61 87 | 88 | processor(s) core(s) used: 12 in 2 node(s). 89 | total time: 111.21s for 282 tokens, tok/s: 2.54 90 | 91 | processor(s) core(s) used: 24 in 2 node(s). 92 | total time: 136.61s for 282 tokens, tok/s: 2.06 (slown down occured) 93 | 94 | processor(s) core(s) used: 22 in 2 node(s). 95 | total time: 99.81s for 282 tokens, tok/s: 2.83 96 | 97 | processor(s) core(s) used: 20 in 2 node(s). 98 | total time: 108.33s for 282 tokens, tok/s: 2.60 99 | 100 | model weights converted to float12. 101 | 102 | processor(s) core(s) used: 10 in 1 node(s). 
(HT off) 103 | total time: 208.03s for 432 tokens, tok/s: 2.08 104 | 105 | processor(s) core(s) used: 10 in 2 node(s). 106 | total time: 182.43s for 432 tokens, tok/s: 2.37 107 | 108 | processor(s) core(s) used: 24 in 2 node(s). 109 | total time: 181.87s for 432 tokens, tok/s: 2.38 110 | 111 | processor(s) core(s) used: 22 in 2 node(s). 112 | total time: 137.13s for 432 tokens, tok/s: 3.15 113 | 114 | processor(s) core(s) used: 20 in 2 node(s). 115 | total time: 122.16s for 432 tokens, tok/s: 3.54 116 | 117 | processor(s) core(s) used: 18 in 2 node(s). 118 | total time: 144.04s for 432 tokens, tok/s: 3.00 119 | 120 | HT on 121 | 122 | torch float type: bf16 123 | processor(s) core(s) used: 24 in 2 node(s). 124 | total time: 83.26s for 282 tokens, tok/s: 3.39 125 | 126 | processor(s) core(s) used: 22 in 2 node(s). 127 | total time: 98.09s for 282 tokens, tok/s: 2.87 128 | 129 | model weights converted to float12. 130 | processor(s) core(s) used: 24 in 2 node(s). 131 | total time: 134.55s for 432 tokens, tok/s: 3.21 132 | 133 | processor(s) core(s) used: 22 in 2 node(s). 134 | total time: 132.07s for 432 tokens, tok/s: 3.27 135 | 136 | processor(s) core(s) used: 20 in 2 node(s). 137 | total time: 139.39s for 432 tokens, tok/s: 3.10 138 | 139 | 140 | torch float type: bf16 141 | processor(s) used: 24 in 2 node(s). 142 | node 0 procs: 0,2,4,6,8,10,12,14,16,18,20,22, 143 | node 1 procs: 24,26,28,30,32,34,36,38,40,42,44,46, 144 | total time: 114.35s for 282 tokens, tok/s: 2.47 145 | 146 | processor(s) used: 22 in 2 node(s). 147 | total time: 95.71s for 282 tokens, tok/s: 2.95 148 | 149 | model weights converted to float12. 150 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 151 | processor(s) used: 22 in 2 node(s). 
152 | total time: 125.69s for 432 tokens, tok/s: 3.44 153 | 154 | 155 | -------------------------------------------------------------------------------- /tests/2_nodes/llama2_ht_off.txt: -------------------------------------------------------------------------------- 1 | LLama2 7B 2 | C:\dev_c\llama_st>llama_stw run_json/run_llama2.json 3 | read model config in: D:/llama2/llama2-7b-chat-hf/config.json 4 | 5 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 6 | 7 | processor(s) used: 24 in 2 node(s). 8 | total time: 28.00s for 249 tokens, tok/s: 8.89 9 | 10 | processor(s) used: 22 in 2 node(s). 11 | total time: 27.49s for 249 tokens, tok/s: 9.06 12 | 13 | processor(s) used: 20 in 2 node(s). 14 | total time: 27.92s for 249 tokens, tok/s: 8.92 15 | 16 | processor(s) used: 18 in 2 node(s). 17 | total time: 28.55s for 249 tokens, tok/s: 8.72 18 | 19 | processor(s) used: 16 in 2 node(s). 20 | total time: 29.76s for 249 tokens, tok/s: 8.37 21 | 22 | 23 | model weights converted to float12. 24 | 25 | processor(s) used: 24 in 2 node(s). 26 | total time: 22.87s for 249 tokens, tok/s: 10.89 27 | 28 | processor(s) used: 22 in 2 node(s). 29 | total time: 24.57s for 249 tokens, tok/s: 10.13 30 | 31 | processor(s) used: 20 in 2 node(s). 32 | total time: 26.27s for 249 tokens, tok/s: 9.48 33 | 34 | processor(s) used: 18 in 2 node(s). 35 | total time: 28.89s for 249 tokens, tok/s: 8.62 36 | 37 | processor(s) used: 16 in 2 node(s). 38 | total time: 32.35s for 249 tokens, tok/s: 7.70 39 | 40 | 41 | // -------------------------------------- 42 | 43 | read model config in: D:/qwen/qwen2.5-7B-Instruct/config.json 44 | 45 | processor(s) used: 24 in 2 node(s). 46 | total time: 63.40s for 543 tokens, tok/s: 8.56 47 | 48 | processor(s) used: 22 in 2 node(s). 49 | total time: 63.74s for 543 tokens, tok/s: 8.52 50 | 51 | processor(s) used: 20 in 2 node(s). 52 | total time: 64.12s for 543 tokens, tok/s: 8.47 53 | 54 | processor(s) used: 18 in 2 node(s). 
55 | total time: 66.64s for 543 tokens, tok/s: 8.15 56 | 57 | processor(s) used: 16 in 2 node(s). 58 | total time: 69.56s for 543 tokens, tok/s: 7.81 59 | 60 | f12 61 | 62 | processor(s) used: 24 in 2 node(s). 63 | total time: 52.88s for 543 tokens, tok/s: 10.27 64 | 65 | processor(s) used: 22 in 2 node(s). 66 | total time: 59.48s for 543 tokens, tok/s: 9.13 67 | 68 | processor(s) used: 20 in 2 node(s). 69 | total time: 65.05s for 543 tokens, tok/s: 8.35 70 | 71 | C:\dev_c\llama_st> 72 | 73 | 74 | load tokenizer: D:/qwen/qwen2.5-72b-intruct/tokenizer.json 75 | model weights converted to float12. 76 | processor(s) used: 24 in 2 node(s). 77 | total time: 924.74s for 480 tokens, tok/s: 0.52 78 | 79 | 80 | 81 | 82 | load tokenizer: D:/mixtral/Mixtral-8x7B-Instruct-v0.1/tokenizer.json 83 | torch float type: bf16 84 | processor(s) core(s) used: 12 in 1 node(s). (HT off) 85 | total time: 174.74s for 282 tokens, tok/s: 1.61 86 | 87 | processor(s) core(s) used: 12 in 2 node(s). 88 | total time: 111.21s for 282 tokens, tok/s: 2.54 89 | 90 | processor(s) core(s) used: 24 in 2 node(s). 91 | total time: 136.61s for 282 tokens, tok/s: 2.06 92 | 93 | processor(s) core(s) used: 22 in 2 node(s). 94 | total time: 99.81s for 282 tokens, tok/s: 2.83 95 | 96 | processor(s) core(s) used: 20 in 2 node(s). 97 | total time: 108.33s for 282 tokens, tok/s: 2.60 98 | 99 | model weights converted to float12. 100 | 101 | processor(s) core(s) used: 10 in 1 node(s). (HT off) 102 | total time: 208.03s for 432 tokens, tok/s: 2.08 103 | 104 | processor(s) core(s) used: 10 in 2 node(s). 105 | total time: 182.43s for 432 tokens, tok/s: 2.37 106 | 107 | processor(s) core(s) used: 24 in 2 node(s). 108 | total time: 181.87s for 432 tokens, tok/s: 2.38 109 | 110 | processor(s) core(s) used: 22 in 2 node(s). 111 | total time: 137.13s for 432 tokens, tok/s: 3.15 112 | 113 | processor(s) core(s) used: 20 in 2 node(s). 
114 | total time: 122.16s for 432 tokens, tok/s: 3.54 115 | 116 | processor(s) core(s) used: 18 in 2 node(s). 117 | total time: 144.04s for 432 tokens, tok/s: 3.00 118 | 119 | HT on 120 | 121 | torch float type: bf16 122 | processor(s) core(s) used: 24 in 2 node(s). 123 | total time: 83.26s for 282 tokens, tok/s: 3.39 124 | 125 | processor(s) core(s) used: 22 in 2 node(s). 126 | total time: 98.09s for 282 tokens, tok/s: 2.87 127 | 128 | model weights converted to float12. 129 | processor(s) core(s) used: 24 in 2 node(s). 130 | total time: 134.55s for 432 tokens, tok/s: 3.21 131 | 132 | processor(s) core(s) used: 22 in 2 node(s). 133 | total time: 132.07s for 432 tokens, tok/s: 3.27 134 | 135 | processor(s) core(s) used: 20 in 2 node(s). 136 | total time: 139.39s for 432 tokens, tok/s: 3.10 137 | 138 | 139 | // modif numa 140 | torch float type: bf16 141 | processor(s) used: 24 in 2 node(s). 142 | node 0 procs: 0,2,4,6,8,10,12,14,16,18,20,22, 143 | node 1 procs: 24,26,28,30,32,34,36,38,40,42,44,46, 144 | total time: 114.35s for 282 tokens, tok/s: 2.47 145 | 146 | processor(s) used: 22 in 2 node(s). 147 | total time: 95.71s for 282 tokens, tok/s: 2.95 148 | 149 | model weights converted to float12. 150 | numa node(s): 2, mp node: 1, num logical/physical procs.: 48/24 (HT on) 151 | processor(s) used: 22 in 2 node(s). 
152 | total time: 125.69s for 432 tokens, tok/s: 3.44 153 | 154 | 155 | -------------------------------------------------------------------------------- /src/matmul/matmul_f16.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | #include "matmul_priv.h" 6 | 7 | // ------------------------------------------------------------------ 8 | // conversion f16 => f32 9 | // ------------------------------------------------------------------ 10 | 11 | static float *lut_f16_to_f32 = NULL; 12 | 13 | // must be used to create conversion lut only 14 | static void cvt_f16_to_f32_fpu(float *f32, const f16_t *f16, size_t ne) 15 | { 16 | size_t i; 17 | for (i=0; i!=ne; i++) 18 | f32[i] = lut_f16_to_f32[f16[i]]; 19 | } 20 | 21 | static void cvt_f16_to_f32_sse(float *f32, const f16_t *f16, size_t ne) 22 | { 23 | size_t i; 24 | for (i=0; i!=ne; i+=4) 25 | _mm_store_ps(f32 + i, _mm_cvtph_ps(_mm_loadl_epi64((__m128i *)(f16 + i)))); 26 | } 27 | 28 | static void cvt_f16_to_f32_avx1(float *f32, const f16_t *f16, size_t ne) 29 | { 30 | size_t i; 31 | for (i=0; i!=ne; i+=8) 32 | _mm256_store_ps(f32 + i, _mm256_cvtph_ps(_mm_load_si128((__m128i *)(f16 + i)))); 33 | } 34 | 35 | const cvt_f16_to_f32_t cvt_f16_to_f32_procs[simd_n] = 36 | { 37 | cvt_f16_to_f32_fpu, 38 | cvt_f16_to_f32_sse, 39 | cvt_f16_to_f32_avx1, 40 | NULL, 41 | }; 42 | 43 | // ------------------------------------------------------------------ 44 | // matmul f32 * f16 => f32 45 | // ------------------------------------------------------------------ 46 | 47 | // is very slow, usable for very small models 48 | static void matmul_f32_f16_fpu(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 49 | { 50 | const f16_t *m, *m_end = mat + y_mat * len_vec; 51 | for (m=mat; m!=m_end; m+=len_vec) 52 | { 53 | float acc = 0; 54 | int i; 55 | for (i=0; i!=len_vec; i++) 56 | acc += vec[i] * 
lut_f16_to_f32[m[i]]; 57 | *res++ = acc; 58 | } 59 | } 60 | 61 | static void matmul_f32_f16_sse(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 62 | { // res[y] = dot(vec, mat row y); requires len_vec multiple of 4 (loop exit uses !=) and 16-byte aligned vec (_mm_load_ps) 63 | const f16_t *m, *m_end = mat + y_mat * len_vec; 64 | for (m=mat; m!=m_end; m+=len_vec) 65 | { 66 | __m128 acc = _mm_setzero_ps(); // accumulator starts at 0 67 | int i; 68 | for (i=0; i!=len_vec; i+=4) 69 | acc = _mm_fmadd_ps(_mm_cvtph_ps(_mm_loadl_epi64((__m128i *)(m + i))), _mm_load_ps(vec + i), acc); // F16C convert 4 halves + FMA 70 | *res++ = hsum_ps_sse(acc); 71 | } 72 | } 73 | 74 | static void matmul_f32_f16_avx1(float *res, const float *vec, const f16_t *mat, int len_vec, int y_mat) 75 | { // 8 floats per step; requires len_vec multiple of 8, 32-byte aligned vec (_mm256_load_ps) and 16-byte aligned mat rows (_mm_load_si128) 76 | const f16_t *m, *m_end = mat + y_mat * len_vec; 77 | for (m=mat; m!=m_end; m+=len_vec) 78 | { 79 | __m256 acc = _mm256_setzero_ps(); 80 | int i; 81 | for (i=0; i!=len_vec; i+=8) 82 | acc = _mm256_fmadd_ps(_mm256_cvtph_ps(_mm_load_si128((__m128i *)(m + i))), _mm256_load_ps(vec + i), acc); 83 | *res++ = hsum_ps_avx1(acc); 84 | } 85 | } 86 | 87 | // dispatch table indexed by simd mode: fpu, sse, avx1 (last entry NULL — presumably the avx2 slot; confirm against matmul.h) 88 | const matmul_f32_f16_t matmul_f32_f16_procs[simd_n] = 89 | { 90 | matmul_f32_f16_fpu, 91 | matmul_f32_f16_sse, 92 | matmul_f32_f16_avx1, 93 | NULL, 94 | }; 95 | 96 | // ------------------------------------------------------------------ 97 | // F16 conversions 98 | // ------------------------------------------------------------------ 99 | 100 | #include "l_util.h" 101 | #include "mem_alloc.h" 102 | 103 | // -------------------------------------------------------- 104 | // software conversion f16 to f32 if no CPU support of F16C 105 | // (opterons 62xx, xeon E55xx, x56xx, xeon E5 v1, ..) 106 | // used for data conversion to sf16 only.
107 | 108 | // software convert if no F16C support 109 | static f16_t sw_cvt_f32_to_f16(float f32) 110 | { 111 | const uint32_t b = (*(uint32_t*)&f32) + 0x00001000; 112 | const uint32_t e = (b & 0x7F800000) >> 23; 113 | uint32_t r = (b & 0x80000000) >> 16; 114 | if (e > 101) 115 | { 116 | const uint32_t m = b & 0x007FFFFF; 117 | if (e < 113) r |= (((0x007FF000 + m) >> (125-e)) + 1) >> 1; 118 | else 119 | { 120 | r |= (((e - 112) << 10) & 0x7C00) | m >> 13; 121 | if (e > 143) r |= 0x7FFF; 122 | } 123 | } 124 | return (f16_t)r; 125 | } 126 | 127 | // convert buffer f32 to f16 128 | void cvt_f32_to_f16(f16_t *f16, const float *f32, size_t ne) 129 | { 130 | size_t i; 131 | if (matmul_procs.cpu_f16c) 132 | { 133 | for (i=0; i!=ne; i+=4) 134 | { 135 | __m128i h4 = _mm_cvtps_ph(_mm_loadu_ps(f32 + i), _MM_FROUND_TO_NEAREST_INT); // convert to 4 float 16 136 | _mm_storel_epi64((__m128i *)(f16 + i), h4); 137 | } 138 | } 139 | else 140 | { 141 | for (i=0; i!=ne; i++) 142 | f16[i] = sw_cvt_f32_to_f16(f32[i]); 143 | } 144 | } 145 | 146 | static float sw_cvt_f16_to_f32(f16_t f16) 147 | { 148 | const uint32_t e = (f16 & 0x7C00) >> 10; // exponent 149 | const uint32_t m = (f16 & 0x03FF) << 13; // mantissa 150 | uint32_t r = (f16 & 0x8000) << 16; 151 | 152 | if (e) r |= ((e + 112) << 23 | m); 153 | else if (m) 154 | { 155 | const float f = (float)m; 156 | const uint32_t v = (*(uint32_t*)&f)>>23; 157 | r |= ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)); 158 | } 159 | return *(float *)&r; 160 | } 161 | 162 | void init_sw_f16c(void) 163 | { 164 | int i; 165 | lut_f16_to_f32 = malloc_check(N_64K*sizeof(float)); 166 | for (i=0; i 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | 6 | // ------------------------------------------------------------------ 7 | // conversion bf16 => f32 8 | // ------------------------------------------------------------------ 9 | 10 | static void cvt_bf16_to_f32_fpu(float *f32, const bf16_t *bf16, size_t ne) 11 | { 12 | int 
*ps = (int *)f32; 13 | size_t i; 14 | for (i=0; i f32 63 | // ------------------------------------------------------------------ 64 | 65 | static void matmul_f32_bf16_fpu(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 66 | { 67 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 68 | for (m=mat; m!=m_end; m+=len_vec) 69 | { 70 | float acc = 0; 71 | int i; 72 | for (i=0; i!=len_vec; i++) 73 | { 74 | unsigned int _f = m[i] << 16; 75 | acc += vec[i] * *(float *)&_f; 76 | } 77 | *res++ = acc; 78 | } 79 | } 80 | 81 | static void matmul_f32_bf16_sse(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 82 | { 83 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 84 | for (m=mat; m!=m_end; m+=len_vec) 85 | { 86 | __m128 acc0 = _mm_setzero_ps(); 87 | __m128 acc1 = _mm_setzero_ps(); 88 | __m128 acc2 = _mm_setzero_ps(); 89 | __m128 acc3 = _mm_setzero_ps(); 90 | int i; 91 | for (i=0; i!=len_vec; i+=16) 92 | { 93 | __m128i d0 = _mm_load_si128((__m128i *)(m + i)); 94 | __m128 ps_l0 = _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), d0)); 95 | __m128 ps_h0 = _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), d0)); 96 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 97 | __m128 ps_l1 = _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), d1)); 98 | __m128 ps_h1 = _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), d1)); 99 | acc0 = _mm_fmadd_ps(ps_l0, _mm_load_ps(vec + i ), acc0); 100 | acc1 = _mm_fmadd_ps(ps_h0, _mm_load_ps(vec + i + 4 ), acc1); 101 | acc2 = _mm_fmadd_ps(ps_l1, _mm_load_ps(vec + i + 8 ), acc2); 102 | acc3 = _mm_fmadd_ps(ps_h1, _mm_load_ps(vec + i + 12), acc3); 103 | } 104 | *res++ = hsum_ps_sse_4x(acc0,acc1,acc2,acc3); 105 | } 106 | } 107 | 108 | static void matmul_f32_bf16_avx1(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 109 | { 110 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 111 | for (m=mat; m!=m_end; m+=len_vec) 112 | { 113 | __m256 acc0 = 
_mm256_setzero_ps(); 114 | __m256 acc1 = _mm256_setzero_ps(); 115 | 116 | int i; 117 | for (i=0; i!=len_vec; i+=16) 118 | { 119 | __m128i d0 = _mm_load_si128((__m128i *)(m + i )); 120 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 121 | acc0 = _mm256_fmadd_ps(GET_8BF16_AVX1(d0), _mm256_load_ps(vec + i ), acc0); 122 | acc1 = _mm256_fmadd_ps(GET_8BF16_AVX1(d1), _mm256_load_ps(vec + i + 8), acc1); 123 | } 124 | *res++ = hsum_ps_avx_2x(acc0, acc1); 125 | } 126 | } 127 | 128 | static void matmul_f32_bf16_avx2(float *res, const float *vec, const bf16_t *mat, int len_vec, int y_mat) 129 | { 130 | const bf16_t *m, *m_end = mat + y_mat * len_vec; 131 | for (m=mat; m!=m_end; m+=len_vec) 132 | { 133 | __m256 acc0 = _mm256_setzero_ps(); 134 | __m256 acc1 = _mm256_setzero_ps(); 135 | 136 | int i; 137 | for (i=0; i!=len_vec; i+=16) 138 | { 139 | __m128i d0 = _mm_load_si128((__m128i *)(m + i )); 140 | __m128i d1 = _mm_load_si128((__m128i *)(m + i + 8)); 141 | acc0 = _mm256_fmadd_ps(GET_8BF16_AVX2(d0), _mm256_load_ps(vec + i ), acc0); 142 | acc1 = _mm256_fmadd_ps(GET_8BF16_AVX2(d1), _mm256_load_ps(vec + i + 8), acc1); 143 | } 144 | *res++ = hsum_ps_avx_2x(acc0, acc1); 145 | } 146 | } 147 | 148 | // init functions list 149 | const matmul_f32_bf16_t matmul_f32_bf16_procs[simd_n] = 150 | { 151 | matmul_f32_bf16_fpu, 152 | matmul_f32_bf16_sse, 153 | matmul_f32_bf16_avx1, 154 | matmul_f32_bf16_avx2 155 | }; 156 | -------------------------------------------------------------------------------- /src/matmul/tr_opt_simd.c: -------------------------------------------------------------------------------- 1 | // simd optimized head attention for transformer 2 | 3 | #ifdef USE_SA_SIMD 4 | 5 | #include 6 | #include 7 | #include 8 | #include "mm_hsum.h" 9 | #include "transformer.h" 10 | #include "matmul.h" 11 | #include "tr_opt_simd.h" 12 | 13 | static void head_att_opt_fpu(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t 
*p) 14 | { 15 | float att_max = -1e10; // softmax max att value 16 | float att_e_sum = 0; // softmax exp diff sum 17 | int kv_dim = p->kv_dim; 18 | int head_size = p->head_size; 19 | float sqrt_head_size = p->sqrt_head_size; 20 | 21 | int t; 22 | for (t=0; t att_max) 28 | att_max = *a; // softmax max att value 29 | } 30 | 31 | // softmax the scores to get attention weights, from 0..pos inclusively 32 | for (t=0; t 0, accumulate xb 48 | } 49 | } 50 | 51 | // sse 52 | static void head_att_opt_sse(float *xb, int n_tok, float *att, const float *q, const float *k, const float *v, const struct transformer_config_t *p) 53 | { 54 | float att_max = -1e10; // softmax max att value 55 | float att_e_sum = 0; // softmax exp diff sum 56 | int kv_dim = p->kv_dim; 57 | int head_size = p->head_size; 58 | int t; 59 | float sqrt_head_size = p->sqrt_head_size; 60 | 61 | const float *m, *m_end = k + n_tok * kv_dim; 62 | float *a = att; 63 | for (m=k; m!=m_end; m+=kv_dim) 64 | { 65 | __m128 acc = _mm_setzero_ps(); 66 | float r; 67 | int i; 68 | for (i=0; i!=head_size; i+=4) 69 | acc = _mm_fmadd_ps(_mm_load_ps(q + i), _mm_load_ps(m + i), acc); 70 | r = hsum_ps_sse(acc); 71 | *a++ = r; 72 | if (r > att_max) 73 | att_max = r; 74 | } 75 | 76 | // softmax the scores to get attention weights, from 0..pos inclusively 77 | for (t=0; tkv_dim; 110 | int head_size = p->head_size; 111 | int t; 112 | float sqrt_head_size = p->sqrt_head_size; 113 | 114 | const float *m, *m_end = k + n_tok * kv_dim; 115 | float *a = att; 116 | for (m=k; m!=m_end; m+=kv_dim) 117 | { 118 | __m256 acc = _mm256_setzero_ps(); 119 | float r; 120 | int i; 121 | for (i=0; i!=head_size; i+=8) 122 | acc = _mm256_fmadd_ps(_mm256_load_ps(q + i), _mm256_load_ps(m + i), acc); 123 | r = hsum_ps_avx1(acc); 124 | *a++ = r; 125 | if (r > att_max) 126 | att_max = r; 127 | } 128 | 129 | // softmax the scores to get attention weights, from 0..pos inclusively 130 | for (t=0; t= simd_avx1) 162 | head_att_opt = head_att_opt_avx; 163 | else 
164 | if (simd_typ == simd_sse) 165 | head_att_opt = head_att_opt_sse; 166 | else 167 | head_att_opt = head_att_opt_fpu; 168 | } 169 | 170 | #endif // USE_SA_SIMD 171 | -------------------------------------------------------------------------------- /run_json/run_llama1.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama1", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 2, // count of .safetensors files in model 12 | "model_path": "D:/llama1_st/7b", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.75, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 2, // mode = 2 required for llama1 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Llama:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // init prompt mode 2 (generate mode) 95 | // for mode 2 correct work: 96 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 97 | // - no space at end of cm2_user_name_sw 98 | // - terminate sys prompt to user name 99 | 100 | // templates for mode 2 101 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 102 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 103 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 104 | 105 | // llama1 generate type chat example 106 | "cm2_sys_prompt": 107 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 108 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 109 | +"User: Hello Bob.\n" 110 | +"Bob: Hello. How may I help you today?\n" 111 | +"User:", 112 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
113 | } 114 | -------------------------------------------------------------------------------- /src/utils/numa_w.c: -------------------------------------------------------------------------------- 1 | // numa infos for windows 2 | 3 | #include 4 | #include "l_util.h" // msg_error 5 | #include "mem_alloc.h" 6 | #include "numa.h" 7 | 8 | // -------------------------------------- 9 | // get some processors/numa configuration 10 | // note: processor group not managed, will return only processors in current group (max 64) 11 | 12 | struct numa_inf_t numa = { 0 }; 13 | 14 | void init_numa_info(void) 15 | { 16 | DWORD sz = 0; 17 | uint64_t p_msk = 0; // physical processors mask 18 | int i, j, lc = 0; // not physical processors count (ht) 19 | unsigned char node_plist[MAX_NUMA_NODES][MAX_NUMA_PROCS]; 20 | 21 | if (!GetLogicalProcessorInformation(NULL, &sz)) 22 | { 23 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) 24 | { 25 | SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pi, *pi_buff; 26 | pi_buff = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION *)malloc_check(sz); 27 | if (GetLogicalProcessorInformation(pi_buff, &sz)) 28 | { 29 | char *pi_end = (char *)pi_buff + sz; 30 | 31 | // get one physical processor if HT enabled, define p_msk 32 | for (pi = pi_buff; (char *)pi < pi_end; pi++) 33 | { 34 | if (pi->Relationship == RelationProcessorCore) 35 | { 36 | // get one processor in mask 37 | uint64_t m = pi->ProcessorMask; 38 | uint64_t m_p = 1; // physical mask 39 | if (!m) // something is wrong 40 | break; 41 | while (!(m & 1)) 42 | { m >>= 1; m_p <<= 1; } 43 | 44 | // count remaining as HT processor 45 | m >>= 1; // pass first found 46 | while (m) 47 | { lc += m & 1; m >>= 1; } 48 | 49 | if (p_msk & m_p) // already defined ? 
50 | break; 51 | p_msk |= m_p; // update global mask for physicals 52 | numa.n_procs++; // cores count 53 | } 54 | } 55 | if ((!p_msk) || ((char *)pi != pi_end)) // a break occured 56 | msg_error("init_numa_info failed (1)"); 57 | 58 | // get processors nodes 59 | for (pi = pi_buff; (char *)pi < pi_end; pi++) 60 | { 61 | if (pi->Relationship == RelationNumaNode) 62 | { 63 | uint64_t m = pi->ProcessorMask; 64 | uint64_t m_p = 1; 65 | int p_id = 0; // proc id 66 | int n_id = pi->NumaNode.NodeNumber; // node id 67 | if (n_id >= MAX_NUMA_NODES) 68 | break; 69 | if (n_id >= numa.n_nodes) 70 | numa.n_nodes = n_id + 1; 71 | while (m) 72 | { 73 | if (m & 1) 74 | { 75 | numa.proc_node[p_id] = n_id; 76 | if (p_msk & m_p) // ignore if HT proc 77 | node_plist[n_id][numa.node_nprocs[n_id]++] = p_id; 78 | } 79 | m >>= 1; 80 | m_p <<= 1; 81 | p_id++; 82 | } 83 | } 84 | } 85 | } 86 | free_check(pi_buff); 87 | } 88 | } 89 | 90 | // main thread 91 | numa.mt_node = numa.proc_node[numa_get_thread_proc()]; 92 | numa.mt_procs = numa.node_nprocs[numa.mt_node]; 93 | 94 | if (!numa.mt_procs) // something is wrong 95 | msg_error("init_numa_info failed (2)"); 96 | 97 | // create sorted procs list for user, with procs for main thread at begin 98 | memcpy(numa.proc_list, node_plist[numa.mt_node], numa.mt_procs); 99 | memset(numa.proc_node, numa.mt_node, numa.mt_procs); 100 | j = numa.mt_procs; 101 | for (i=0; i0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.6, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.90, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 30, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= 
EOS 31 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " bool is_prime(int x)\n{", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Llcode:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "You are a chatbot who can help code.", 97 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/blog/llama2#how-to-prompt-llama-2 103 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "%s [/INST]", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "You are a chatbot who can help code.", 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. 
How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_mathstral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mathstral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 6, // count of .safetensors files 12 | "model_path": "E:/mathstral/Mathstral-7B-v0.1", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Mathstral:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // todo: find prompt documentation. 
103 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_mistral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mistral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 3, // count of .safetensors files 12 | "model_path": "E:/mistral/mistral-7b-instruct-v0.3", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Mistral:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // todo: find prompt documentation. 
103 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_tinyllama.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "tinyllama", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 1, // count of .safetensors files in model 12 | "model_path": "D:/tinyllama/Tiny-Llama-1.1B-Chat-v1.0", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.6, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.65, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 (cannot with tinyllama) 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Tiny:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "You are a chatbot who can help code.", 97 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 103 | "cm1_sys_template": "<|system|>\n%s\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "<|user|>\n%s\n<|assistant|>\n", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "You are a chatbot who can help code.", 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nTiny:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Tiny. " 126 | +"Tiny is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Tiny.\n" 128 | +"Tiny: Hello. 
How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 131 | } 132 | -------------------------------------------------------------------------------- /run_json/run_zephyr.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "zephyr", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 8, // count of .safetensors files 12 | "model_path": "E:/zephyr/zephyr-7b-beta", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | 26 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 27 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 28 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 29 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 30 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 31 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 32 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 33 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 34 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 35 | "rand_seed": 1234, // (integer) random seed 36 | 37 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 38 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 39 | 40 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 41 | 42 | // ------------------------------------ 43 | 44 | // model load data conversion 45 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 46 | "cvt_f12": false, // convert model to float12 47 | "cvt_f8": false, // convert model to float8 48 | 49 | // hardware parameters 50 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 51 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 52 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 53 | 54 | // run parameters 55 | "run_mode": 0, // 0: generate, 1:chat 56 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 57 | "token_eos_str": "", // end of string token (assistant reply end) 58 | "token_eot_str": "", // end of text token (dialog/generate end) 59 | 60 | // tokens display options in chat or generate mode 61 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
62 | "tok_disp_split": false, // true: display tokens separated with ',' 63 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 64 | 65 | // ------------------------------------ 66 | // generate mode prompt init 67 | 68 | "gen_mode_prompt": " The explanation for the existence of seasons is", 69 | 70 | // ------------------------------------ 71 | // chat mode config 72 | 73 | // dialog colors (r.g.b format) 74 | "chat_use_colors": true, // use colors for chat 75 | "chat_col_msg": "250.250.250", // messages text color 76 | "chat_col_user": "180.255.180", // user text color (keyboard input) 77 | "chat_col_assistant": "180.180.255", // assistant answer text color 78 | 79 | // forward: define what is displayed when forward user prompt 80 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 81 | 82 | // ------------------------------------ 83 | // promp mode: define the method to generate the prompt format 84 | // 0: use model_ident value to select templates defined in chat.c 85 | // 1: user defined templates cm1_xxx.. 86 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 87 | "chat_prompt_mode": 0, 88 | 89 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 90 | "chat_assistant_name": "Zephyr:", 91 | "chat_user_name": "User:", 92 | 93 | // ------------------------------------ 94 | // chat_prompt_mode=0 parameters 95 | 96 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 
97 | "cm0_user_prompt": "What is a pointer in C language ?", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=1 parameters (user defined template) 101 | 102 | // https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha 103 | "cm1_sys_template": "<|system|>\n%s\n", // %s replace cm1_sys_prompt 104 | "cm1_user_first_template": "", // first user template following sys prompt 105 | "cm1_user_template": "<|user|>\n%s\n<|assistant|>\n", // %s replace cm1_user_prompt 106 | "cm1_end_template": "\n", // end of assistant reply template 107 | 108 | "cm1_sys_prompt": "", // no sys prompt with Mistral ? 109 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 115 | // - no space at end of cm2_user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates for mode 2 119 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 121 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 122 | 123 | // llama1 generate type chat example 124 | "cm2_sys_prompt": 125 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 126 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 127 | +"User: Hello Bob.\n" 128 | +"Bob: Hello. How may I help you today?\n" 129 | +"User:", 130 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
131 | } 132 | -------------------------------------------------------------------------------- /src/matmul/matmul_sf16.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mm_hsum.h" 3 | #include "w_types.h" 4 | #include "matmul.h" 5 | #include "matmul_priv.h" 6 | 7 | // -------------------------------- 8 | // float SF16 conversions 9 | 10 | #define SF16_CVT_MSK 0xfffc7fff 11 | #define SF16_CVT_LSL 13 12 | #define SF16_CVT_ADD 0x18800 // sf16 range 1.8626451e-009 to 7.9960938 13 | 14 | #define SF16_CVT_MAX 8.0f // max +/- converted value 15 | #define SF16_ERR_MAX 0.0039062f // max convert error for SF16_CVT_MAX value 16 | 17 | #define F16_8_00 18432 // 8.00 in float 16 18 | #define F16_TO_SF16_MAX F16_8_00 // max F16 value that can be converted to SF16 19 | 20 | // ------------------------------------------------------------------ 21 | // conversion sf16 => f32 22 | // ------------------------------------------------------------------ 23 | 24 | static void cvt_sf16_to_f32_fpu(float *f32, const sf16_t *sf16, size_t ne) 25 | { 26 | int *ps = (int *)f32; 27 | size_t i; 28 | for (i=0; i f32 64 | // ------------------------------------------------------------------ 65 | 66 | static void matmul_f32_sf16_fpu(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 67 | { 68 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 69 | for (m=mat; m!=m_end; m+=len_vec) 70 | { 71 | float acc = 0; 72 | int i; 73 | for (i=0; i!=len_vec; i++) 74 | { 75 | unsigned int f32i = (((short)m[i] & SF16_CVT_MSK) + SF16_CVT_ADD) << SF16_CVT_LSL; 76 | acc += vec[i] * *(float *)&f32i; 77 | } 78 | *res++ = acc; 79 | } 80 | } 81 | 82 | static void matmul_f32_sf16_sse(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 83 | { 84 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 85 | for (m=mat; m!=m_end; m+=len_vec) 86 | { 87 | __m128 acc0 = _mm_setzero_ps(); 88 | __m128 acc1 = _mm_setzero_ps(); 89 | 
__m128 acc2 = _mm_setzero_ps(); 90 | __m128 acc3 = _mm_setzero_ps(); 91 | int i; 92 | for (i=0; i!=len_vec; i+=16) 93 | { 94 | acc0 = _mm_fmadd_ps(_mm_load_ps(vec + i ), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i ))), acc0); 95 | acc1 = _mm_fmadd_ps(_mm_load_ps(vec + i + 4), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 4))), acc1); 96 | acc2 = _mm_fmadd_ps(_mm_load_ps(vec + i + 8), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 8))), acc2); 97 | acc3 = _mm_fmadd_ps(_mm_load_ps(vec + i + 12), CVT_4SF16(_mm_loadl_epi64((__m128i *)(m + i + 12))), acc3); 98 | } 99 | *res++ = hsum_ps_sse_4x(acc0,acc1,acc2,acc3); 100 | } 101 | } 102 | 103 | static void matmul_f32_sf16_avx2(float *res, const float *vec, const sf16_t *mat, int len_vec, int y_mat) 104 | { 105 | const sf16_t *m, *m_end = mat + y_mat * len_vec; 106 | for (m=mat; m!=m_end; m+=len_vec) 107 | { 108 | __m256 acc = _mm256_setzero_ps(); 109 | int i; 110 | for (i=0; i!=len_vec; i+=8) 111 | acc = _mm256_fmadd_ps(_mm256_load_ps(vec + i), CVT_8SF16(_mm_load_si128((__m128i *)(m + i))), acc); 112 | *res++ = hsum_ps_avx1(acc); 113 | } 114 | } 115 | 116 | // init functions list 117 | const matmul_f32_sf16_t matmul_f32_sf16_procs[simd_n] = 118 | { 119 | matmul_f32_sf16_fpu, 120 | matmul_f32_sf16_sse, 121 | NULL, 122 | matmul_f32_sf16_avx2, 123 | }; 124 | 125 | // ------------------------------------------------------------------ 126 | // SF16 conversions 127 | // ------------------------------------------------------------------ 128 | 129 | #include "l_util.h" 130 | #include "mem_alloc.h" 131 | 132 | // lut to convert model weights 133 | static sf16_t lut_f16_to_sf16[N_64K] = { 0 }; 134 | 135 | // f32 to sf16 (using e_ofs = 98) 136 | static int f32_to_sf16(float f32) 137 | { 138 | int a = *(int *)&f32; 139 | int e = (a >> 23) & 0xff; 140 | int m = (a >> (23 - 10)) & ((1 << 10) - 1); 141 | int f16 = m + ((e - 98) << 10); 142 | return f16; 143 | } 144 | 145 | // init lookup table. 
146 | void init_conv_sf16(void) 147 | { 148 | // alloc temporary AVX aligned arrays 149 | VAR_ALLOC(f16_list, f16_t, N_64K/2); 150 | VAR_ALLOC(f16_to_f32, float, N_64K/2); 151 | int i; 152 | 153 | for (i=0; i= (N_64K/2)) k = (N_64K/2)-1; 166 | 167 | // note: there is no rounding required, except for F16 0.0 replaced 168 | // by +/-1.8626451e-009, all other values < SF16_TO_F16_MAX match exactly. 169 | lut_f16_to_sf16[i] = k; 170 | lut_f16_to_sf16[i+(N_64K/2)] = 0x8000 | k; 171 | } 172 | free_check(f16_to_f32); 173 | free_check(f16_list); 174 | } 175 | 176 | // convert buffer f16 to sf16 177 | void cvt_f16_to_sf16(sf16_t *sf16, const f16_t *f16, size_t ne) 178 | { 179 | const f16_t *f16_end = f16 + ne; 180 | while (f16 < f16_end) 181 | { 182 | f16_t _a = *f16++; 183 | if (ABS_F16(_a) > F16_TO_SF16_MAX) // check _a can be converted with small error 184 | msg_error("conversion F16 to SF16 out of range"); 185 | *sf16++ = lut_f16_to_sf16[_a]; 186 | } 187 | } -------------------------------------------------------------------------------- /run_json/run_llama2.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama2", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 2, // count of .safetensors files in model 12 | "model_path": "E:/llama2/llama2-7b-chat-hf", // path to .safetensors, config.json 13 | 14 | /* 15 | // test llama2 8B pro 16 | "model_num_safetensors": 2, 17 | "model_path": "E:/llama2_pro/8b_pro_instruct", 18 | */ 19 | 20 | // name of tokenizer 21 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 22 | 23 | // ------------------------------------ 24 | // transformer parameters 25 | 26 | // rope value 27 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 28 | 29 | // ------------------------------------ 30 | // sampler parameters 31 | 32 | "temperature": 0.9, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 33 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 34 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 35 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 36 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 37 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 38 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 39 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_inc_n tokens generated. 
(0 = disable) 40 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 41 | "rand_seed": 1234, // (integer) random seed 42 | 43 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 44 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 45 | 46 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 47 | 48 | // ------------------------------------ 49 | 50 | // model load data conversion 51 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 52 | "cvt_f12": false, // convert model to float12 53 | "cvt_f8": false, // convert model to float8 54 | 55 | // hardware parameters 56 | "num_procs": -1, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 57 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 58 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 59 | 60 | // run parameters 61 | "run_mode": 0, // 0: generate, 1:chat 62 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 63 | "token_eos_str": "", // end of string token (assistant reply end) 64 | "token_eot_str": "", // end of text token (dialog/generate end) 65 | 66 | // tokens display options in chat or generate mode 67 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
68 | "tok_disp_split": false, // true: display tokens separated with ',' 69 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 70 | 71 | // ------------------------------------ 72 | // generate mode prompt init 73 | 74 | "gen_mode_prompt": " The explanation for the existence of seasons is", 75 | 76 | // ------------------------------------ 77 | // chat mode config 78 | 79 | // dialog colors (r.g.b format) 80 | "chat_use_colors": true, // use colors for chat 81 | "chat_col_msg": "250.250.250", // messages text color 82 | "chat_col_user": "180.255.180", // user text color (keyboard input) 83 | "chat_col_assistant": "180.180.255", // assistant answer text color 84 | 85 | // forward: define what is displayed when forward user prompt 86 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 87 | 88 | // ------------------------------------ 89 | // promp mode: define the method to generate the prompt format 90 | // 0: use model_ident value to select templates defined in chat.c 91 | // 1: user defined templates cm1_xxx.. 92 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 
93 | "chat_prompt_mode": 0, 94 | 95 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 96 | "chat_assistant_name": "Llama2:", 97 | "chat_user_name": "User:", 98 | 99 | // ------------------------------------ 100 | // chat_prompt_mode=0 parameters 101 | 102 | "cm0_sys_prompt": "You are a chatbot who can help code.", 103 | "cm0_user_prompt": "What is sizeof(int) value in C ?", 104 | 105 | // ------------------------------------ 106 | // chat_prompt_mode=1 parameters (user defined template) 107 | 108 | // https://huggingface.co/blog/llama2#how-to-prompt-llama-2 109 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", // %s replace cm1_sys_prompt 110 | "cm1_user_first_template": "%s [/INST]", // first user template following sys prompt 111 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 112 | "cm1_end_template": "\n", // end of assistant reply template 113 | 114 | "cm1_sys_prompt": "You are a chatbot who can help code.", 115 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 116 | 117 | // ------------------------------------ 118 | // init prompt mode 2 (generate mode) 119 | // for mode 2 correct work: 120 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 121 | // - no space at end of cm2_user_name_sw 122 | // - terminate sys prompt to user name 123 | 124 | // templates for mode 2 125 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 126 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 127 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 128 | 129 | // llama1 generate type chat example 130 | "cm2_sys_prompt": 131 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 132 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 133 | +"User: Hello Bob.\n" 134 | +"Bob: Hello. 
How may I help you today?\n" 135 | +"User:", 136 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 137 | } 138 | -------------------------------------------------------------------------------- /run_json/run_mixtral.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "mixtral", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 19, // count of .safetensors files 12 | // "model_path": "E:/mixtral/Mixtral-8x7B-Instruct-v0.1", // path to .safetensors, config.json 13 | "model_path": "D:/mixtral/Mixtral-8x7B-Instruct-v0.1", // path to .safetensors, config.json 14 | 15 | // name of tokenizer 16 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 17 | 18 | // ------------------------------------ 19 | // transformer parameters 20 | 21 | // rope value 22 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 23 | 24 | // ------------------------------------ 25 | // sampler parameters 26 | 27 | "temperature": 0.7, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 28 | "topp": 0.80, // 0.01 to 0.99: max probability sum of top tokens 29 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 30 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 31 | "topp_eos": true, // true: limit topp list size to token with probability >= EOS 32 | "repeat_penalty": 0.05, // 0.0..2.0 repeat penalty (0.0 = disable) 33 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 34 | "eos_amp": 0.0, // 0.0 to 2.0 amplify 
eos probability when more than eos_inc_n tokens generated. (0 = disable) 35 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 36 | "rand_seed": 1234, // (integer) random seed 37 | 38 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 39 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 40 | 41 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 42 | 43 | // ------------------------------------ 44 | 45 | // model load data conversion 46 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 47 | "cvt_f12": true, // convert model to float12 48 | "cvt_f8": false, // convert model to float8 (required on 64Gb mem) 49 | 50 | // hardware parameters 51 | "num_procs": 22, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 52 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 53 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 54 | 55 | // run parameters 56 | "run_mode": 0, // 0: generate, 1:chat 57 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 58 | "token_eos_str": "", // end of string token (assistant reply end) 59 | "token_eot_str": "", // end of text token (dialog/generate end) 60 | 61 | // tokens display options in chat or generate mode 62 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
63 | "tok_disp_split": false, // true: display tokens separated with ',' 64 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 65 | 66 | // ------------------------------------ 67 | // generate mode prompt init 68 | 69 | "gen_mode_prompt": " The explanation for the existence of seasons is", 70 | 71 | // ------------------------------------ 72 | // chat mode config 73 | 74 | // dialog colors (r.g.b format) 75 | "chat_use_colors": true, // use colors for chat 76 | "chat_col_msg": "250.250.250", // messages text color 77 | "chat_col_user": "180.255.180", // user text color (keyboard input) 78 | "chat_col_assistant": "180.180.255", // assistant answer text color 79 | 80 | // forward: define what is displayed when forward user prompt 81 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 82 | 83 | // ------------------------------------ 84 | // promp mode: define the method to generate the prompt format 85 | // 0: use model_ident value to select templates defined in chat.c 86 | // 1: user defined templates cm1_xxx.. 87 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 88 | "chat_prompt_mode": 0, 89 | 90 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 91 | "chat_assistant_name": "Mixtral:", 92 | "chat_user_name": "User:", 93 | 94 | // ------------------------------------ 95 | // chat_prompt_mode=0 parameters 96 | 97 | "cm0_sys_prompt": "", // todo, no sys prompt with instruct model ? 
98 | "cm0_user_prompt": "What is a pointer in C language ?", 99 | 100 | // ------------------------------------ 101 | // chat_prompt_mode=1 parameters (user defined template) 102 | 103 | // https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format 104 | "cm1_sys_template": "", // %s replace cm1_sys_prompt 105 | "cm1_user_first_template": " [INST] %s [/INST]", // first user template following sys prompt 106 | "cm1_user_template": "[INST] %s [/INST]", // %s replace cm1_user_prompt 107 | "cm1_end_template": "\n", // end of assistant reply template 108 | 109 | "cm1_sys_prompt": "", 110 | "cm1_user_prompt": "What is sizeof(int) value in C ?", 111 | 112 | // ------------------------------------ 113 | // init prompt mode 2 (generate mode) 114 | // for mode 2 correct work: 115 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 116 | // - no space at end of cm2_user_name_sw 117 | // - terminate sys prompt to user name 118 | 119 | // templates for mode 2 120 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 121 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next using keyboard input string 122 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 123 | 124 | // llama1 generate type chat example 125 | "cm2_sys_prompt": 126 | "Transcript of a dialog, where the User interacts with an assistant named Bob. " 127 | +"Bob is good at computer programming and never fails to respond to user requests accurately.\n\n" 128 | +"User: Hello Bob.\n" 129 | +"Bob: Hello. How may I help you today?\n" 130 | +"User:", 131 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 132 | } 133 | -------------------------------------------------------------------------------- /run_json/run_vigogne2.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "vigogne2", // model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | /* 12 | "model_num_safetensors": 7, 13 | "model_path": "E:/vigogne/vigogne-33b-instruct", 14 | */ 15 | 16 | "model_num_safetensors": 14, // count of .safetensors files 17 | "model_path": "E:/vigogne/vigogne-2-13b-instruct", 18 | 19 | // name of tokenizer 20 | "tokenizer_name": "", // if empty, model_path/tokenizer.json is used 21 | 22 | // ------------------------------------ 23 | // transformer parameters 24 | 25 | // rope value 26 | "rope_set": 10000.0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 27 | 28 | // ------------------------------------ 29 | // sampler parameters 30 | 31 | "temperature": 0.85, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 32 | "topp": 0.70, // 0.01 to 0.99: max probability sum of top tokens 33 | "topk": 25, // (integer) limit size of top tokens list 5..200 (0 = disable) 34 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 35 | "topp_eos": false, // true: limit topp list size to token with probability >= EOS 36 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 37 | "repeat_penalty_n": 50, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 38 | "eos_amp": 0.0, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 39 | "eos_amp_n": 150, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 40 | "rand_seed": 1234, // (integer) random seed 41 | 42 | // (optional) if ch_restrict defined, sample only tokens that contain ascii chars and sample_restrict chars. 
43 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 44 | 45 | "test_nan_logits": false, // test for NAN at sampling in logits (debug, problem detect) 46 | 47 | // ------------------------------------ 48 | 49 | // model load data conversion 50 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 51 | "cvt_f12": false, // convert model to float12 52 | "cvt_f8": false, // convert model to float8 (cannot with tinyllama) 53 | 54 | // hardware parameters 55 | "num_procs": 12, // <=0: max auto detected, >0: user value. note: max procs do not always produce best performances 56 | "numa_nodes": -1, // <=0: all auto detected. >0: max nodes to use 57 | "simd_mode": -1, // <=0: max auto detect, 0:fpu 1:sse 2:avx, 3:avx2 58 | 59 | // run parameters 60 | "run_mode": 0, // 0: generate, 1:chat 61 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 62 | "token_eos_str": "", // end of string token (assistant reply end) 63 | "token_eot_str": "", // end of text token (dialog/generate end) 64 | 65 | // tokens display options in chat or generate mode 66 | "tok_disp_raw": false, // true: display special tokens (LF,, etc..) 
67 | "tok_disp_split": false, // true: display tokens separated with ',' 68 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 69 | 70 | // ------------------------------------ 71 | // generate mode prompt init 72 | 73 | "gen_mode_prompt": " L'explication de l'existence de saisons est", 74 | 75 | // ------------------------------------ 76 | // chat mode config 77 | 78 | // dialog colors (r.g.b format) 79 | "chat_use_colors": true, // use colors for chat 80 | "chat_col_msg": "250.250.250", // messages text color 81 | "chat_col_user": "180.255.180", // user text color (keyboard input) 82 | "chat_col_assistant": "180.180.255", // assistant answer text color 83 | 84 | // forward: define what is displayed when forward user prompt 85 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test usage) 86 | 87 | // ------------------------------------ 88 | // promp mode: define the method to generate the prompt format 89 | // 0: use model_ident value to select templates defined in chat.c 90 | // 1: user defined templates cm1_xxx.. 91 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 92 | "chat_prompt_mode": 0, 93 | 94 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 95 | "chat_assistant_name": "Vigogne:", 96 | "chat_user_name": "User:", 97 | 98 | // ------------------------------------ 99 | // chat_prompt_mode=0 parameters 100 | 101 | "cm0_sys_prompt": "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", 102 | "cm0_user_prompt": "Bonjour ! 
Comment ça va aujourd'hui ?", 103 | 104 | // ------------------------------------ 105 | // chat_prompt_mode=1 parameters (user defined template) 106 | /* 107 | // format for 7B, see https://huggingface.co/bofenghuang/vigogne-2-7b-chat 108 | "cm1_sys_template": "<|system|>: %s\n", 109 | "cm1_user_first_template": "", 110 | "cm1_user_template": "<|user|>: %s\n<|assistant|>:", 111 | "cm1_end_template": "\n", // end of assistant reply 112 | */ 113 | // format for 70B, see https://huggingface.co/bofenghuang/vigogne-2-70b-chat 114 | "cm1_sys_template": "[INST] <>\n%s\n<>\n\n", 115 | "cm1_user_first_template": "%s [/INST]", // first user template 116 | "cm1_user_template": "[INST] %s [/INST]", 117 | "cm1_end_template": "\n", 118 | 119 | "cm1_sys_prompt": "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", 120 | "cm1_user_prompt": "Bonjour ! Comment ça va aujourd'hui ?", 121 | 122 | // ------------------------------------ 123 | // init prompt mode 2 (generate mode) 124 | // for mode 2 correct work: 125 | // - ensure names coherence in user_template/sys_prompt/user_name_sw 126 | // - no space at end of cm2_user_name_sw 127 | // - terminate sys prompt to user name 128 | 129 | // templates for mode 2 130 | "cm2_sys_template": " %s", // %s replaced by cm2_sys_prompt, = emit bos 131 | "cm2_user_template": " %s\nTiny:", // %s = cm2_user_prompt at init and next using keyboard input string 132 | "cm2_user_name_sw": "\nUser:", // user name switch (end template) 133 | 134 | // llama1 generate type chat example 135 | "cm2_sys_prompt": 136 | "Transcript of a dialog, where the User interacts with an assistant named Tiny. " 137 | +"Tiny is good at computer programming and never fails to respond to user requests accurately.\n\n" 138 | +"User: Hello Tiny.\n" 139 | +"Tiny: Hello. How may I help you today?\n" 140 | +"User:", 141 | "cm2_user_prompt": "What is sizeof(int) value in C ?" 
142 | } 143 | -------------------------------------------------------------------------------- /src/utils/utf8.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "utf8.h" 4 | 5 | // ------------------------------------ 6 | // UTF8 7 | 8 | // encode one char to utf8, return length, 0 if error 9 | int utf8_char_encode(char *s, int code) 10 | { 11 | // 7 bits 0bbb.bbbb 12 | if (code < (1 << 7)) 13 | { 14 | s[0] = code; 15 | return 1; 16 | } 17 | // 5 + 6 bits 110b.bbbb 10bb.bbbb 18 | if (code < (1 << (5 + 6))) 19 | { 20 | s[0] = 0xc0 | (code >> 6); 21 | s[1] = 0x80 | (code & 0x3f); 22 | return 2; 23 | } 24 | // 4 + 6 + 6 bits 1110.bbbb 10bb.bbbb 10bb.bbbb 25 | if (code < (1 << (4 + 6 + 6))) 26 | { 27 | s[0] = 0xe0 | (code >> 12); 28 | s[1] = 0x80 | ((code >> 6) & 0x3f); 29 | s[2] = 0x80 | (code & 0x3f); 30 | return 3; 31 | } 32 | // 3 + 6 + 6 + 6 bits 1111.0bbb 10bb.bbbb 10bb.bbbb 10bb.bbbb 33 | if (code < (1 << (3 + 6 + 6 + 6))) 34 | { 35 | s[0] = 0xf0 | (code >> 18); 36 | s[1] = 0x80 | ((code >> 12) & 0x3f); 37 | s[2] = 0x80 | ((code >> 6) & 0x3f); 38 | s[3] = 0x80 | (code & 0x3f); 39 | return 4; 40 | } 41 | // code too big to encode 42 | return 0; 43 | } 44 | 45 | // return encoded value and length 46 | int utf8_char_decode(const char *s, int *code) 47 | { 48 | // 7 bits 0bbb.bbbb 49 | if (!(s[0] & 0x80)) 50 | { 51 | *code = s[0]; 52 | return 1; 53 | } 54 | // 5 + 6 bits 110b.bbbb 10bb.bbbb 55 | if ((s[0] & 0xe0) == 0xc0) 56 | { 57 | *code = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); 58 | if ((s[1] & 0xc0) == 0x80) 59 | return 2; 60 | } 61 | // 4 + 6 + 6 bits 1110.bbbb 10bb.bbbb 10bb.bbbb 62 | if ((s[0] & 0xf0) == 0xe0) 63 | { 64 | *code = ((s[0] & 0xf) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); 65 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) 66 | return 3; 67 | } 68 | // 3 + 6 + 6 + 6 bits 1111.0bbb 10bb.bbbb 10bb.bbbb 10bb.bbbb 69 | if ((s[0] & 0xf8) == 0xf0) 70 | { 71 | *code = 
((s[0] & 0x7) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); 72 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80) && ((s[3] & 0xc0) == 0x80)) 73 | return 4; 74 | } 75 | // invalid encoding 76 | *code = 0; 77 | return 0; 78 | } 79 | 80 | // return encoded length 81 | int utf8_char_len(const char *s) 82 | { 83 | if (!(s[0] & 0x80)) 84 | return 1; 85 | if ((s[0] & 0xe0) == 0xc0) 86 | { 87 | if ((s[1] & 0xc0) == 0x80) 88 | return 2; 89 | } 90 | else 91 | if ((s[0] & 0xf0) == 0xe0) 92 | { 93 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) 94 | return 3; 95 | } 96 | else 97 | if ((s[0] & 0xf8) == 0xf0) 98 | { 99 | if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80) && ((s[3] & 0xc0) == 0x80)) 100 | return 4; 101 | } 102 | // invalid encoding 103 | return 0; 104 | } 105 | 106 | // return count of utf8 char coded in string, return 0 if coding error found 107 | int utf8_get_char_count(const char *s) 108 | { 109 | int char_count = 0; 110 | while (*s) 111 | { 112 | int l = utf8_char_len(s); 113 | if (!l) 114 | return 0; // encoding error 115 | s += l; 116 | char_count++; 117 | } 118 | return char_count; 119 | } 120 | 121 | // convert text cr + lf or single lf to single cr 122 | bool utf8_cvt_crlf_to_cr(char *s) 123 | { 124 | char *d = s; 125 | while (*s) 126 | { 127 | int l = utf8_char_len(s); 128 | if (!l) 129 | break; // encoding error 130 | 131 | if (*s == 0x0d) // CR + LF to CR 132 | { 133 | *d++ = 0x0d; 134 | s += (s[1] == 0x0a) ? 
2 : 1; 135 | } 136 | else 137 | if (*s == 0x0a) // LF to CR 138 | { 139 | *d++ = 0x0d; 140 | s++; 141 | } 142 | else 143 | while (l--) 144 | *d++ = *s++; 145 | } 146 | *d = 0; 147 | return !*s; 148 | } 149 | 150 | #if 0 151 | // ----------------------------------------------- 152 | // test code 153 | // https://fr.wikipedia.org/wiki/UTF-8 154 | #include 155 | 156 | typedef struct 157 | { 158 | int code; 159 | char *s; 160 | int len; 161 | } ut_t; 162 | 163 | // wiki page examples 164 | const ut_t u_list[] = { 165 | { 159 , "\xC2\x9F", 2 }, 166 | { 160 , "\xC2\xA0", 2 }, 167 | { 191 , "\xC2\xBF", 2 }, 168 | { 192 , "\xC3\x80", 2 }, 169 | { 233 , "\xC3\xA9", 2 }, 170 | { 2047 , "\xDF\xBF", 2 }, 171 | { 2048 , "\xE0\xA0\x80", 3 }, 172 | { 8364 , "\xE2\x82\xAC", 3 }, 173 | { 55295 , "\xED\x9F\xBF", 3 }, 174 | { 57344 , "\xEE\x80\x80", 3 }, 175 | { 63743 , "\xEF\xA3\xBF", 3 }, 176 | { 63744 , "\xEF\xA4\x80", 3 }, 177 | { 64975 , "\xEF\xB7\x8F", 3 }, 178 | { 64976 , "\xEF\xB7\x90", 3 }, 179 | { 65007 , "\xEF\xB7\xAF", 3 }, 180 | { 65008 , "\xEF\xB7\xB0", 3 }, 181 | { 65533 , "\xEF\xBF\xBD", 3 }, 182 | { 65534 , "\xEF\xBF\xBE", 3 }, 183 | { 65535 , "\xEF\xBF\xBF", 3 }, 184 | { 65536 , "\xF0\x90\x80\x80", 4 }, 185 | { 119070 , "\xF0\x9D\x84\x9E", 4 }, 186 | { 131069 , "\xF0\x9F\xBF\xBD", 4 }, 187 | { 131070 , "\xF0\x9F\xBF\xBE", 4 }, 188 | { 131071 , "\xF0\x9F\xBF\xBF", 4 }, 189 | { 131072 , "\xF0\xA0\x80\x80", 4 }, 190 | { 196605 , "\xF0\xAF\xBF\xBD", 4 }, 191 | { 196606 , "\xF0\xAF\xBF\xBE", 4 }, 192 | { 196607 , "\xF0\xAF\xBF\xBF", 4 }, 193 | { 196608 , "\xF0\xB0\x80\x80", 4 }, 194 | { 262141 , "\xF0\xBF\xBF\xBD", 4 }, 195 | { 262142 , "\xF0\xBF\xBF\xBE", 4 }, 196 | { 262143 , "\xF0\xBF\xBF\xBF", 4 }, 197 | { 917504 , "\xF3\xA0\x80\x80", 4 }, 198 | { 983037 , "\xF3\xAF\xBF\xBD", 4 }, 199 | { 983038 , "\xF3\xAF\xBF\xBE", 4 }, 200 | { 983039 , "\xF3\xAF\xBF\xBF", 4 }, 201 | { 983040 , "\xF3\xB0\x80\x80", 4 }, 202 | { 1048573 , "\xF3\xBF\xBF\xBD", 4 }, 203 | { 1048574 , 
"\xF3\xBF\xBF\xBE", 4 }, 204 | { 1048575 , "\xF3\xBF\xBF\xBF", 4 }, 205 | { 1048576 , "\xF4\x80\x80\x80", 4 }, 206 | { 1114109 , "\xF4\x8F\xBF\xBD", 4 }, 207 | { 1114110 , "\xF4\x8F\xBF\xBE", 4 }, 208 | { 1114111 , "\xF4\x8F\xBF\xBF", 4 }, 209 | { 0, NULL, 0 } }; 210 | 211 | int main(void) 212 | { 213 | int i; 214 | for (i=0; u_list[i].s; i++) 215 | { 216 | const ut_t *u = &u_list[i]; 217 | int l, code; 218 | l = utf8_char_len(u->s); 219 | if (l != u->len) 220 | break; 221 | l = utf8_char_decode(u->s, &code); 222 | if (l != u->len) 223 | break; 224 | if (code != u->code) 225 | break; 226 | } 227 | if (u_list[i].s) 228 | printf("decode failed.\n"); 229 | 230 | // check encode 231 | for (i=0; i < (1 << (3 + 6 + 6 + 6)); i++) 232 | { 233 | char s[8]; 234 | int code; 235 | int le = utf8_char_encode(s, i); 236 | int ld = utf8_char_decode(s, &code); 237 | if (!le || (ld != le) || (code != i)) 238 | { 239 | printf("encode failed.\n"); 240 | break; 241 | } 242 | } 243 | } 244 | #endif -------------------------------------------------------------------------------- /run_json/run_llama3.json: -------------------------------------------------------------------------------- 1 | // llm run parameters. 
note: this file must be saved in utf-8 format 2 | { 3 | // ------------------------------------ 4 | // model identifier 5 | 6 | "model_ident": "llama3", // define model type for model specificities, refer to model.c for list 7 | 8 | // ------------------------------------ 9 | // model load 10 | 11 | "model_num_safetensors": 4, // count of .safetensors files 12 | "model_path": "D:/llama3_st/8b-instruct", // path to .safetensors, config.json 13 | 14 | // name of tokenizer 15 | "tokenizer_name": "", // if empty: model_path/tokenizer.json used else define full path+name 16 | 17 | // ------------------------------------ 18 | // transformer parameters 19 | 20 | // rope value 21 | "rope_set": 0, // 0:use config.json or .safetensors data, >0:user value ex:10000.0 (llama2 value) 22 | 23 | // ------------------------------------ 24 | // sampler parameters 25 | "temperature": 1.0, // 0.0 to 2.0: 0.0:greedy decoding 2.0:maximum creativity (1.0 = disable) 26 | "topp": 0.65, // 0.01 to 0.99: max probability sum of top tokens 27 | "topk": 40, // (integer) limit size of top tokens list 5..200 (0 = disable) 28 | "topp_minp": 0.05, // 0.0 to 1.0: (experimental) !=0: min token probability required to continue generate if EOS contained in topp list 29 | "topp_eos": false, // true: limit topp list size to token with probability >= EOS 30 | "repeat_penalty": 0.0, // 0.0..2.0 repeat penalty (0.0 = disable) 31 | "repeat_penalty_n": 0, // (integer) count of last generated tokens used to apply repeat penalty (0 = disable, min 10) 32 | "eos_amp": 0.5, // 0.0 to 2.0 amplify eos probability when more than eos_amp_n tokens generated. (0 = disable) 33 | "eos_amp_n": 250, // (integer) count of tokens generated before starting eos_amp influence (0 = disable, min 10) 34 | "rand_seed": 1234, // (integer) random seed 35 | 36 | // (optional) if ch_restrict defined, allow to sample only tokens that contain ascii chars and utf8 chars contained in sample_restrict string. 
37 | // "ch_restrict": "ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²", 38 | 39 | "test_nan_logits": false, // test for NAN at sampling in logits result 40 | 41 | // ------------------------------------ 42 | 43 | // model load data conversion 44 | "cvt_sf16": false, // convert model to sf16 (require f16 model) 45 | "cvt_f12": false, // convert model to float12 (should be possible with all models, require all weights <= 4.0) 46 | "cvt_f8": false, // convert model to float8 (not possible with some models, require all weights <= 2.0) 47 | 48 | // hardware parameters 49 | "num_procs": 12, // -1: max auto detected (may be adjusted), >0: user value. note: max procs do not always produce best performances 50 | "numa_nodes": -1, // -1: use all detected. 0: skip numa specific code, >0: max nodes to use 51 | "simd_mode": -1, // -1: max detect, 0:fpu 1:sse 2:avx, 3:avx2 52 | 53 | // run parameters 54 | "run_mode": 0, // 0: generate, 1:chat 55 | "gen_run_steps": -1, // generate mode run steps: <=0: model max context size, >0:user value 56 | "token_eos_str": "<|eot_id|>", // end of string token (assistant reply end) 57 | "token_eot_str": "<|end_of_text|>", // end of text token (dialog/generate end) 58 | 59 | // tokens display options in chat or generate mode 60 | "tok_disp_raw": false, // true: display control/byte for tokens (LF,, etc) 61 | "tok_disp_split":false, // true: display token list separated with ',' 62 | "tok_disp_prob": false, // true: display sampling info (add [score + n topp] + ',') 63 | 64 | // ------------------------------------ 65 | // generate mode config 66 | 67 | "gen_mode_prompt": "<|begin_of_text|>The explanation for the existence of seasons is", 68 | 69 | // ------------------------------------ 70 | // chat mode config 71 | 72 | // dialog colors (r.g.b format) 73 | "chat_use_colors": true, // use colors for chat 74 | "chat_col_msg": "250.250.250", // messages text color 75 | "chat_col_user": "180.255.180", // user text 
color (keyboard input) 76 | "chat_col_assistant": "180.180.255", // assistant answer text color 77 | 78 | // forward: define what is displayed when forward user prompt 79 | "fwd_disp_mode": 0, // 0: display nothing, 1:tokens list (test/check mode) 80 | 81 | // ------------------------------------ 82 | // promp mode: define the method to generate the prompt format 83 | // 0: use model_ident value to select templates defined in chat.c 84 | // 1: user defined templates cm1_xxx.. 85 | // 2: use generate mode (llama1), should work with any models (chat/non-chat). 86 | "chat_prompt_mode": 0, // 0: use model_ident value to define templates. 1, user defined template, 2 use generate mode (llama1) 87 | 88 | // prompt names displayed for assistant and user in mode 0 and 1 (defined by sys_prompt in mode 2) 89 | "chat_assistant_name": "LLama3:", 90 | "chat_user_name": "User:", 91 | 92 | // ------------------------------------ 93 | // chat_prompt_mode=0 parameters 94 | 95 | "cm0_sys_prompt": "You are a helpful AI assistant for travel tips and recommendations", 96 | "cm0_user_prompt": "What is France's capital?", 97 | 98 | // ------------------------------------ 99 | // chat_prompt_mode=1 parameters (user defined template) 100 | 101 | // https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ 102 | // https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1 103 | "cm1_sys_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|>", 104 | "cm1_user_first_template": "", 105 | "cm1_user_template": "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", 106 | "cm1_end_template": "<|eot_id|>\n", 107 | 108 | "cm1_sys_prompt": "You are a helpful assistant.", 109 | "cm1_user_prompt": "Explain shortly what is a pointer in C language ?", 110 | 111 | // ------------------------------------ 112 | // init prompt mode 2 (generate mode) 113 | // for mode 2 correct work: 114 | // - ensure 
defined names coherences in user_template/sys_prompt/user_name_sw 115 | // - ensure no space at end of user_name_sw 116 | // - terminate sys prompt to user name 117 | 118 | // templates required for mode 1 or 2 119 | "cm2_sys_template": "<|begin_of_text|>%s", // %s replaced by cm2_sys_prompt, = emit bos 120 | "cm2_user_template": " %s\nBob:", // %s = cm2_user_prompt at init and next with keyboard input string 121 | // user name switch, required for mode 2 only, string detect switch to user in sys prompt (size 5..10 char, no space at end) 122 | "cm2_user_name_sw": "\nUser:", 123 | 124 | // here is llama.cpp project llama1 chat example 125 | "cm2_sys_prompt": 126 | "Transcript of a dialog, where the User interacts with an Assistant named Bob. " 127 | +"Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\n\n" 128 | +"User: Hello, Bob.\n" 129 | +"Bob: Hello. How may I help you today?\n" 130 | +"User:", 131 | "cm2_user_prompt": "Do you know what is the first prime number greater than 15 ?" 132 | } 133 | --------------------------------------------------------------------------------