├── logo.png
├── user.png
├── assistant.png
├── smoLM135M.gif
├── models
│   └── yourModelGGUF here.md
├── instructions.txt
├── README.md
├── st-SmoL135M-llamafile.py
└── models_details.txt
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/135M-you-cannot-go-Smaller/main/logo.png
--------------------------------------------------------------------------------
/user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/135M-you-cannot-go-Smaller/main/user.png
--------------------------------------------------------------------------------
/assistant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/135M-you-cannot-go-Smaller/main/assistant.png
--------------------------------------------------------------------------------
/smoLM135M.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/135M-you-cannot-go-Smaller/main/smoLM135M.gif
--------------------------------------------------------------------------------
/models/yourModelGGUF here.md:
--------------------------------------------------------------------------------
1 | Download the model into this subfolder:
2 |
3 | ```
4 | wget https://huggingface.co/MaziyarPanahi/SmolLM-135M-Instruct-GGUF/resolve/main/SmolLM-135M-Instruct.Q8_0.gguf -OutFile SmolLM-135M-Instruct.Q8_0.gguf
5 | ```
6 |
7 |
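8 | (The command above uses the PowerShell `wget` alias; see the README for Linux/macOS syntax.)
9 |
10 | Alternatively, a minimal Python sketch using the `huggingface_hub` package (an assumption: it is not among this repo's listed dependencies, so install it first):
11 |
12 | ```
13 | from huggingface_hub import hf_hub_download
14 |
15 | # run from the repository root: fetches the GGUF file into the "models" folder
16 | hf_hub_download(
17 |     repo_id="MaziyarPanahi/SmolLM-135M-Instruct-GGUF",
18 |     filename="SmolLM-135M-Instruct.Q8_0.gguf",
19 |     local_dir="models",
20 | )
21 | ```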
--------------------------------------------------------------------------------
/instructions.txt:
--------------------------------------------------------------------------------
1 | mkdir HFSmol_LM
2 |
3 | cd HFSmol_LM
4 | python -m venv venv
5 |
6 | venv\Scripts\activate
7 |
8 | deactivate    (run this later, when you want to exit the venv)
9 |
10 | pip install streamlit==1.36.0 openai tiktoken
11 |
12 |
13 | Download llamafile
14 | wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.12/llamafile-0.8.12 -OutFile llamafile-0.8.12.exe
15 |
16 |
17 | Download the weights
18 | mkdir models
19 | cd models
20 |
21 | wget https://huggingface.co/MaziyarPanahi/SmolLM-135M-Instruct-GGUF/resolve/main/SmolLM-135M-Instruct.Q8_0.gguf -OutFile SmolLM-135M-Instruct.Q8_0.gguf
22 |
23 | wget https://huggingface.co/MaziyarPanahi/SmolLM-360M-Instruct-GGUF/resolve/main/SmolLM-360M-Instruct.Q8_0.gguf -OutFile SmolLM-360M-Instruct.Q8_0.gguf
24 |
25 |
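26 | Run the llamafile server (from the project root; the venv is not required):
27 | cd ..
28 | .\llamafile-0.8.12.exe --nobrowser --host 0.0.0.0 -m .\models\SmolLM-135M-Instruct.Q8_0.gguf -c 2048
29 |
30 | In another terminal, with the venv active, start the UI:
31 | streamlit run st-SmoL135M-llamafile.py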
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # 135M-you-cannot-go-Smaller
4 | Repo of the code from the Medium article *135M: you cannot go Smaller*
5 |
6 |
7 |
8 | Tested on Windows 11 with 16 GB RAM and Python 3.11+. The download commands below use the PowerShell `wget` alias for `Invoke-WebRequest`; on Linux/macOS, use `wget -O <file> <url>` instead.
9 |
10 | #### Create VENV and Install dependencies
11 | ```
12 | mkdir HFSmol_LM
13 |
14 | cd HFSmol_LM
15 | python -m venv venv
16 |
17 | venv\Scripts\activate
18 |
19 | # run deactivate later, when you want to exit the venv
20 |
21 | pip install streamlit==1.36.0 openai tiktoken
22 | ```
23 |
24 | #### Download Llamafile
25 | ```
26 | wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.12/llamafile-0.8.12 -OutFile llamafile-0.8.12.exe
27 | ```
28 |
29 | #### Download the model in the `models` subfolder
30 | ```
31 | mkdir models
32 | cd models
33 |
34 | wget https://huggingface.co/MaziyarPanahi/SmolLM-135M-Instruct-GGUF/resolve/main/SmolLM-135M-Instruct.Q8_0.gguf -OutFile SmolLM-135M-Instruct.Q8_0.gguf
35 | ```
36 |
37 | ### How to run
38 | 1. In one terminal window (the venv does not need to be active), run:
39 | ```
40 | .\llamafile-0.8.12.exe --nobrowser --host 0.0.0.0 -m .\models\SmolLM-135M-Instruct.Q8_0.gguf -c 2048
41 | ```
42 | 2. In another terminal window, with the `venv` active, run:
43 | ```
44 | streamlit run st-SmoL135M-llamafile.py
45 | ```
46 |
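47 | #### Quick check (optional)
48 | A minimal sketch to verify the server is answering, using the same OpenAI-compatible endpoint settings as `st-SmoL135M-llamafile.py`; run it with the venv active:
49 | ```
50 | from openai import OpenAI
51 |
52 | # llamafile exposes an OpenAI-compatible API on port 8080 by default
53 | client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
54 | completion = client.chat.completions.create(
55 |     model="local-model",  # this field is ignored by llamafile
56 |     messages=[{"role": "user", "content": "Say hello in one short sentence."}],
57 |     max_tokens=50,
58 | )
59 | print(completion.choices[0].message.content)
60 | ```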
--------------------------------------------------------------------------------
/st-SmoL135M-llamafile.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from openai import OpenAI
3 | from time import sleep
4 | import datetime
5 | import random
6 | import string
7 | import tiktoken
8 |
9 | # for counting the tokens in the prompt and in the result
10 | #context_count = len(encoding.encode(yourtext))
11 | encoding = tiktoken.get_encoding("r50k_base")
12 |
13 | modelname = 'SmolLM-135M-Instruct'
14 | modelfile = 'models/SmolLM-135M-Instruct.Q8_0.gguf'
15 |
16 | # function to append the given text to the log file
17 | def writehistory(filename,text):
18 |     with open(filename, 'a', encoding='utf-8') as f:
19 |         f.write(text)
20 |         f.write('\n')
21 |
22 | #AVATARS 👷🐦 🥶🌀
23 | av_us = 'user.png' #"🦖" #A single emoji, e.g. "🧑💻", "🤖", "🦖". Shortcodes are not supported.
24 | av_ass = 'assistant.png'
25 |
26 | # Set the webpage title
27 | st.set_page_config(
28 |     page_title=f"Your LocalGPT with 🌟 {modelname}",
29 |     page_icon="🌟",
30 |     layout="wide")
31 |
32 | # Create a header element
33 | mytitle = '# Your own LocalGPT 🌟'
34 | st.markdown(mytitle, unsafe_allow_html=True)
35 | st.markdown('### SmolLM-135M-Instruct, 2048 tokens context window')
36 | # function to generate a random alphanumeric sequence for the log filename
37 | def genRANstring(n):
38 |     """
39 |     n = int number of characters to randomize
40 |     """
41 |     return ''.join(random.choices(string.ascii_uppercase +
42 |                                   string.digits, k=n))
43 |
44 | # create the session state entries
45 | if "logfilename" not in st.session_state:
46 |     ## Logger file
47 |     logfile = f'{genRANstring(5)}_log.txt'
48 |     st.session_state.logfilename = logfile
49 |     # write the first two entries into the log
50 |     writehistory(st.session_state.logfilename,f'{str(datetime.datetime.now())}\n\nYour own LocalGPT with 🌀 {modelname}\n---\n🧠🫡: You are a helpful assistant.')
51 |     writehistory(st.session_state.logfilename,f'🌀: How may I help you today?')
52 |
53 | if "repeat" not in st.session_state:
54 |     st.session_state.repeat = 1.35
55 |
56 | if "temperature" not in st.session_state:
57 |     st.session_state.temperature = 0.1
58 |
59 | if "maxlength" not in st.session_state:
60 |     st.session_state.maxlength = 500
61 |
62 | # Point to the local server.
63 | # Replace localhost with the IP address of the computer acting as a server, if it is not the local machine;
64 | # it may be something like "http://192.168.1.52:8080/v1"
65 | client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed", organization='SelectedModel')
66 |
67 | # CREATE THE SIDEBAR
68 | with st.sidebar:
69 |     st.image('logo.png', use_column_width=True)
70 |     st.session_state.temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.1, step=0.02)
71 |     st.session_state.maxlength = st.slider('Length reply:', min_value=150, max_value=1000,
72 |                                            value=500, step=50)
73 |     st.session_state.repeat = st.slider('Repeat Penalty:', min_value=0.0, max_value=2.0, value=1.35, step=0.01)
74 |     st.markdown(f"**Logfile**: {st.session_state.logfilename}")
75 |     btnClear = st.button("Clear History",type="primary", use_container_width=True)
76 |
77 | # We store the conversation in the session state.
78 | # This will be used to render the chat conversation.
79 | # We initialize it with the first message we want to be greeted with.
80 | # Note that the first 3 messages are never sent for generation; they only seed the chat interface.
81 | if "messages" not in st.session_state:
82 |     st.session_state.messages = [
83 |         {"role": "system", "content": "You are SmolLM, a helpful assistant. You reply only to the user questions. You always reply in the language of the instructions.",},
84 |         {"role": "user", "content": "Hi, I am Fabio."},
85 |         {"role": "assistant", "content": "Hi there, I am SmolLM, how may I help you today?"}
86 |     ]
87 | # reset the on-screen conversation history to the initial messages
88 | def clearHistory():
89 |     st.session_state.messages = [
90 |         {"role": "system", "content": "You are SmolLM, a helpful assistant. You reply only to the user questions. You always reply in the language of the instructions.",},
91 |         {"role": "user", "content": "Hi, I am Fabio."},
92 |         {"role": "assistant", "content": "Hi there, I am SmolLM, how may I help you today?"}
93 |     ]
94 | if btnClear:
95 |     clearHistory()
96 |
97 | # We loop through each message in the session state and render it as a chat message.
98 | for message in st.session_state.messages[1:]:
99 |     if message["role"] == "user":
100 |         with st.chat_message(message["role"],avatar=av_us):
101 |             st.markdown(message["content"])
102 |     else:
103 |         with st.chat_message(message["role"],avatar=av_ass):
104 |             st.markdown(message["content"])
105 |
106 | # We take questions/instructions from the chat input to pass to the LLM
107 | if user_prompt := st.chat_input("Your message here. Shift+Enter to add a new line", key="user_input"):
108 |
109 |     # Add our input to the session state
110 |     st.session_state.messages.append(
111 |         {"role": "user", "content": user_prompt}
112 |     )
113 |
114 |     # Add our input to the chat window
115 |     with st.chat_message("user", avatar=av_us):
116 |         st.markdown(user_prompt)
117 |         writehistory(st.session_state.logfilename,f'👷: {user_prompt}')
118 |
119 |     with st.chat_message("assistant",avatar=av_ass):
120 |         message_placeholder = st.empty()
121 |         with st.spinner("Thinking..."):
122 |             response = ''
123 |             conv_messages = []
124 |             # only the latest user message is sent (the 2048-token context is small)
125 |             conv_messages.append(st.session_state.messages[-1])
126 |             full_response = ""
127 |             completion = client.chat.completions.create(
128 |                 model="local-model", # this field is currently unused
129 |                 messages=conv_messages, # use st.session_state.messages instead to keep the previous turns
130 |                 temperature=st.session_state.temperature,
131 |                 frequency_penalty = st.session_state.repeat,
132 |                 stop=['<|im_end|>',''],
133 |                 max_tokens=st.session_state.maxlength,
134 |                 stream=True,
135 |             )
136 |             # stream the reply chunk by chunk, refreshing the placeholder as text arrives
137 |             for chunk in completion:
138 |                 if chunk.choices[0].delta.content:
139 |                     full_response += chunk.choices[0].delta.content
140 |                     message_placeholder.markdown(full_response + "🌟")
141 |             # append a small token-count report to the displayed and logged reply
142 |             toregister = full_response + f"""
143 | ```
144 |
145 | prompt tokens: {len(encoding.encode(st.session_state.messages[-1]['content']))}
146 | generated tokens: {len(encoding.encode(full_response))}
147 | ```"""
148 |             message_placeholder.markdown(toregister)
149 |             writehistory(st.session_state.logfilename,f'🌟: {toregister}\n\n---\n\n')
150 |
151 |     # Add the response to the session state
152 |     st.session_state.messages.append(
153 |         {"role": "assistant", "content": toregister}
154 |     )
155 |
--------------------------------------------------------------------------------
/models_details.txt:
--------------------------------------------------------------------------------
1 | >>> q = Llama(model_path='models\Lite-Oute-1-65M-Instruct-Q8_0.gguf', verbose=True)
2 | llama_model_loader: loaded meta data with 27 key-value pairs and 75 tensors from models\Lite-Oute-1-65M-Instruct-Q8_0.gguf (version GGUF V3 (latest))
3 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
4 | llama_model_loader: - kv 0: general.architecture str = llama
5 | llama_model_loader: - kv 1: general.name str = Lite-Oute-1-65M-Instruct
6 | llama_model_loader: - kv 2: llama.block_count u32 = 8
7 | llama_model_loader: - kv 3: llama.context_length u32 = 2048
8 | llama_model_loader: - kv 4: llama.embedding_length u32 = 512
9 | llama_model_loader: - kv 5: llama.feed_forward_length u32 = 2048
10 | llama_model_loader: - kv 6: llama.attention.head_count u32 = 16
11 | llama_model_loader: - kv 7: llama.attention.head_count_kv u32 = 8
12 | llama_model_loader: - kv 8: llama.rope.freq_base f32 = 10000.000000
13 | llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001
14 | llama_model_loader: - kv 10: general.file_type u32 = 7
15 | llama_model_loader: - kv 11: llama.vocab_size u32 = 32768
16 | llama_model_loader: - kv 12: llama.rope.dimension_count u32 = 32
17 | llama_model_loader: - kv 13: tokenizer.ggml.add_space_prefix bool = true
18 | llama_model_loader: - kv 14: tokenizer.ggml.model str = llama
19 | llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
20 | llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32768] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
21 | llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32768] = [0.000000, 0.000000, 0.000000, 0.0000...
22 | llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32768] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
23 | llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1
24 | llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 32000
25 | llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0
26 | llama_model_loader: - kv 22: tokenizer.ggml.padding_token_id u32 = 32000
27 | llama_model_loader: - kv 23: tokenizer.ggml.add_bos_token bool = true
28 | llama_model_loader: - kv 24: tokenizer.ggml.add_eos_token bool = false
29 | llama_model_loader: - kv 25: tokenizer.chat_template str = {% for message in messages %}{{'<|im_...
30 | llama_model_loader: - kv 26: general.quantization_version u32 = 2
31 | llama_model_loader: - type f32: 17 tensors
32 | llama_model_loader: - type q8_0: 58 tensors
33 | llm_load_vocab: special tokens cache size = 771
34 | llm_load_vocab: token to piece cache size = 0.1710 MB
35 | llm_load_print_meta: format = GGUF V3 (latest)
36 | llm_load_print_meta: arch = llama
37 | llm_load_print_meta: vocab type = SPM
38 | llm_load_print_meta: n_vocab = 32768
39 | llm_load_print_meta: n_merges = 0
40 | llm_load_print_meta: vocab_only = 0
41 | llm_load_print_meta: n_ctx_train = 2048
42 | llm_load_print_meta: n_embd = 512
43 | llm_load_print_meta: n_layer = 8
44 | llm_load_print_meta: n_head = 16
45 | llm_load_print_meta: n_head_kv = 8
46 | llm_load_print_meta: n_rot = 32
47 | llm_load_print_meta: n_swa = 0
48 | llm_load_print_meta: n_embd_head_k = 32
49 | llm_load_print_meta: n_embd_head_v = 32
50 | llm_load_print_meta: n_gqa = 2
51 | llm_load_print_meta: n_embd_k_gqa = 256
52 | llm_load_print_meta: n_embd_v_gqa = 256
53 | llm_load_print_meta: f_norm_eps = 0.0e+00
54 | llm_load_print_meta: f_norm_rms_eps = 1.0e-06
55 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
56 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
57 | llm_load_print_meta: f_logit_scale = 0.0e+00
58 | llm_load_print_meta: n_ff = 2048
59 | llm_load_print_meta: n_expert = 0
60 | llm_load_print_meta: n_expert_used = 0
61 | llm_load_print_meta: causal attn = 1
62 | llm_load_print_meta: pooling type = 0
63 | llm_load_print_meta: rope type = 0
64 | llm_load_print_meta: rope scaling = linear
65 | llm_load_print_meta: freq_base_train = 10000.0
66 | llm_load_print_meta: freq_scale_train = 1
67 | llm_load_print_meta: n_ctx_orig_yarn = 2048
68 | llm_load_print_meta: rope_finetuned = unknown
69 | llm_load_print_meta: ssm_d_conv = 0
70 | llm_load_print_meta: ssm_d_inner = 0
71 | llm_load_print_meta: ssm_d_state = 0
72 | llm_load_print_meta: ssm_dt_rank = 0
73 | llm_load_print_meta: model type = ?B
74 | llm_load_print_meta: model ftype = Q8_0
75 | llm_load_print_meta: model params = 65.02 M
76 | llm_load_print_meta: model size = 65.91 MiB (8.50 BPW)
77 | llm_load_print_meta: general.name = Lite-Oute-1-65M-Instruct
78 | llm_load_print_meta: BOS token = 1 '<s>'
79 | llm_load_print_meta: EOS token = 32000 '<|im_end|>'
80 | llm_load_print_meta: UNK token = 0 '<unk>'
81 | llm_load_print_meta: PAD token = 32000 '<|im_end|>'
82 | llm_load_print_meta: LF token = 13 '<0x0A>'
83 | llm_load_print_meta: EOT token = 32000 '<|im_end|>'
84 | llm_load_print_meta: max token length = 48
85 | llm_load_tensors: ggml ctx size = 0.04 MiB
86 | llm_load_tensors: CPU buffer size = 65.91 MiB
87 | ......................................
88 | llama_new_context_with_model: n_ctx = 512
89 | llama_new_context_with_model: n_batch = 512
90 | llama_new_context_with_model: n_ubatch = 512
91 | llama_new_context_with_model: flash_attn = 0
92 | llama_new_context_with_model: freq_base = 10000.0
93 | llama_new_context_with_model: freq_scale = 1
94 | llama_kv_cache_init: CPU KV buffer size = 4.00 MiB
95 | llama_new_context_with_model: KV self size = 4.00 MiB, K (f16): 2.00 MiB, V (f16): 2.00 MiB
96 | llama_new_context_with_model: CPU output buffer size = 0.13 MiB
97 | llama_new_context_with_model: CPU compute buffer size = 65.00 MiB
98 | llama_new_context_with_model: graph nodes = 262
99 | llama_new_context_with_model: graph splits = 1
100 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
101 | Model metadata: {'general.name': 'Lite-Oute-1-65M-Instruct', 'general.architecture': 'llama', 'llama.block_count': '8', 'llama.context_length': '2048', 'tokenizer.ggml.eos_token_id': '32000', 'general.file_type': '7', 'llama.attention.head_count_kv': '8', 'llama.embedding_length': '512', 'llama.feed_forward_length': '2048', 'llama.attention.head_count': '16', 'llama.rope.freq_base': '10000.000000', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.vocab_size': '32768', 'llama.rope.dimension_count': '32', 'tokenizer.ggml.pre': 'default', 'tokenizer.ggml.add_space_prefix': 'true', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '32000', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"}
102 | Available chat formats from metadata: chat_template.default
103 | Guessed chat format: chatml
104 |
105 |
106 |
107 | ////////////////////////////////////////////////////////////////////////////////////////////////////////
108 | >>> q = Llama(model_path='models\Lite-Mistral-150M-v2-Instruct-Q8_0.gguf', verbose=True)
109 | llama_model_loader: loaded meta data with 26 key-value pairs and 111 tensors from models\Lite-Mistral-150M-v2-Instruct-Q8_0.gguf (version GGUF V3 (latest))
110 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
111 | llama_model_loader: - kv 0: general.architecture str = llama
112 | llama_model_loader: - kv 1: general.name str = Lite-Mistral-150M-v2-Instruct
113 | llama_model_loader: - kv 2: llama.block_count u32 = 12
114 | llama_model_loader: - kv 3: llama.context_length u32 = 2048
115 | llama_model_loader: - kv 4: llama.embedding_length u32 = 768
116 | llama_model_loader: - kv 5: llama.feed_forward_length u32 = 3072
117 | llama_model_loader: - kv 6: llama.attention.head_count u32 = 16
118 | llama_model_loader: - kv 7: llama.attention.head_count_kv u32 = 8
119 | llama_model_loader: - kv 8: llama.rope.freq_base f32 = 10000.000000
120 | llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001
121 | llama_model_loader: - kv 10: general.file_type u32 = 7
122 | llama_model_loader: - kv 11: llama.vocab_size u32 = 32768
123 | llama_model_loader: - kv 12: llama.rope.dimension_count u32 = 48
124 | llama_model_loader: - kv 13: tokenizer.ggml.add_space_prefix bool = true
125 | llama_model_loader: - kv 14: tokenizer.ggml.model str = llama
126 | llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
127 | llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32768] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
128 | llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32768] = [0.000000, 0.000000, 0.000000, 0.0000...
129 | llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32768] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
130 | llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1
131 | llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 2
132 | llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0
133 | llama_model_loader: - kv 22: tokenizer.ggml.add_bos_token bool = true
134 | llama_model_loader: - kv 23: tokenizer.ggml.add_eos_token bool = false
135 | llama_model_loader: - kv 24: tokenizer.chat_template str = {% for message in messages %}{{bos_to...
136 | llama_model_loader: - kv 25: general.quantization_version u32 = 2
137 | llama_model_loader: - type f32: 25 tensors
138 | llama_model_loader: - type q8_0: 86 tensors
139 | llm_load_vocab: special tokens cache size = 771
140 | llm_load_vocab: token to piece cache size = 0.1710 MB
141 | llm_load_print_meta: format = GGUF V3 (latest)
142 | llm_load_print_meta: arch = llama
143 | llm_load_print_meta: vocab type = SPM
144 | llm_load_print_meta: n_vocab = 32768
145 | llm_load_print_meta: n_merges = 0
146 | llm_load_print_meta: vocab_only = 0
147 | llm_load_print_meta: n_ctx_train = 2048
148 | llm_load_print_meta: n_embd = 768
149 | llm_load_print_meta: n_layer = 12
150 | llm_load_print_meta: n_head = 16
151 | llm_load_print_meta: n_head_kv = 8
152 | llm_load_print_meta: n_rot = 48
153 | llm_load_print_meta: n_swa = 0
154 | llm_load_print_meta: n_embd_head_k = 48
155 | llm_load_print_meta: n_embd_head_v = 48
156 | llm_load_print_meta: n_gqa = 2
157 | llm_load_print_meta: n_embd_k_gqa = 384
158 | llm_load_print_meta: n_embd_v_gqa = 384
159 | llm_load_print_meta: f_norm_eps = 0.0e+00
160 | llm_load_print_meta: f_norm_rms_eps = 1.0e-06
161 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
162 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
163 | llm_load_print_meta: f_logit_scale = 0.0e+00
164 | llm_load_print_meta: n_ff = 3072
165 | llm_load_print_meta: n_expert = 0
166 | llm_load_print_meta: n_expert_used = 0
167 | llm_load_print_meta: causal attn = 1
168 | llm_load_print_meta: pooling type = 0
169 | llm_load_print_meta: rope type = 0
170 | llm_load_print_meta: rope scaling = linear
171 | llm_load_print_meta: freq_base_train = 10000.0
172 | llm_load_print_meta: freq_scale_train = 1
173 | llm_load_print_meta: n_ctx_orig_yarn = 2048
174 | llm_load_print_meta: rope_finetuned = unknown
175 | llm_load_print_meta: ssm_d_conv = 0
176 | llm_load_print_meta: ssm_d_inner = 0
177 | llm_load_print_meta: ssm_d_state = 0
178 | llm_load_print_meta: ssm_dt_rank = 0
179 | llm_load_print_meta: model type = ?B
180 | llm_load_print_meta: model ftype = Q8_0
181 | llm_load_print_meta: model params = 156.52 M
182 | llm_load_print_meta: model size = 158.65 MiB (8.50 BPW)
183 | llm_load_print_meta: general.name = Lite-Mistral-150M-v2-Instruct
184 | llm_load_print_meta: BOS token = 1 '<s>'
185 | llm_load_print_meta: EOS token = 2 '</s>'
186 | llm_load_print_meta: UNK token = 0 '<unk>'
187 | llm_load_print_meta: LF token = 13 '<0x0A>'
188 | llm_load_print_meta: max token length = 48
189 | llm_load_tensors: ggml ctx size = 0.05 MiB
190 | llm_load_tensors: CPU buffer size = 158.65 MiB
191 | ..................................................
192 | llama_new_context_with_model: n_ctx = 512
193 | llama_new_context_with_model: n_batch = 512
194 | llama_new_context_with_model: n_ubatch = 512
195 | llama_new_context_with_model: flash_attn = 0
196 | llama_new_context_with_model: freq_base = 10000.0
197 | llama_new_context_with_model: freq_scale = 1
198 | llama_kv_cache_init: CPU KV buffer size = 9.00 MiB
199 | llama_new_context_with_model: KV self size = 9.00 MiB, K (f16): 4.50 MiB, V (f16): 4.50 MiB
200 | llama_new_context_with_model: CPU output buffer size = 0.13 MiB
201 | llama_new_context_with_model: CPU compute buffer size = 65.50 MiB
202 | llama_new_context_with_model: graph nodes = 390
203 | llama_new_context_with_model: graph splits = 1
204 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
205 | Model metadata: {'general.name': 'Lite-Mistral-150M-v2-Instruct', 'general.architecture': 'llama', 'llama.block_count': '12', 'llama.context_length': '2048', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '7', 'llama.attention.head_count_kv': '8', 'llama.embedding_length': '768', 'llama.feed_forward_length': '3072', 'llama.attention.head_count': '16', 'llama.rope.freq_base': '10000.000000', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.vocab_size': '32768', 'llama.rope.dimension_count': '48', 'tokenizer.ggml.pre': 'default', 'tokenizer.ggml.add_space_prefix': 'true', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"}
206 | Available chat formats from metadata: chat_template.default
207 | Using gguf chat template: {% for message in messages %}{{bos_token + message['role'] + '
208 | ' + message['content'] + eos_token + '
209 | '}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant
210 | ' }}{% endif %}
211 | Using chat eos_token: </s>
212 | Using chat bos_token: <s>
213 |
214 |
215 | /////////////////////////////////////////////////////////////////////////////////////////////////////
216 | >>> q = Llama(model_path='models\Lite-Oute-1-300M-Instruct-Q8_0.gguf', verbose=True)
217 | llama_model_loader: loaded meta data with 27 key-value pairs and 183 tensors from models\Lite-Oute-1-300M-Instruct-Q8_0.gguf (version GGUF V3 (latest))
218 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
219 | llama_model_loader: - kv 0: general.architecture str = llama
220 | llama_model_loader: - kv 1: general.name str = Lite-Oute-1-300M-Instruct
221 | llama_model_loader: - kv 2: llama.block_count u32 = 20
222 | llama_model_loader: - kv 3: llama.context_length u32 = 4096
223 | llama_model_loader: - kv 4: llama.embedding_length u32 = 896
224 | llama_model_loader: - kv 5: llama.feed_forward_length u32 = 3584
225 | llama_model_loader: - kv 6: llama.attention.head_count u32 = 16
226 | llama_model_loader: - kv 7: llama.attention.head_count_kv u32 = 8
227 | llama_model_loader: - kv 8: llama.rope.freq_base f32 = 10000.000000
228 | llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001
229 | llama_model_loader: - kv 10: general.file_type u32 = 7
230 | llama_model_loader: - kv 11: llama.vocab_size u32 = 32768
231 | llama_model_loader: - kv 12: llama.rope.dimension_count u32 = 56
232 | llama_model_loader: - kv 13: tokenizer.ggml.add_space_prefix bool = true
233 | llama_model_loader: - kv 14: tokenizer.ggml.model str = llama
234 | llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
235 | llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32768] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
236 | llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32768] = [0.000000, 0.000000, 0.000000, 0.0000...
237 | llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32768] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
238 | llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1
239 | llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 32000
240 | llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0
241 | llama_model_loader: - kv 22: tokenizer.ggml.padding_token_id u32 = 32000
242 | llama_model_loader: - kv 23: tokenizer.ggml.add_bos_token bool = true
243 | llama_model_loader: - kv 24: tokenizer.ggml.add_eos_token bool = false
244 | llama_model_loader: - kv 25: tokenizer.chat_template str = {% for message in messages %}{{'<|im_...
245 | llama_model_loader: - kv 26: general.quantization_version u32 = 2
246 | llama_model_loader: - type f32: 41 tensors
247 | llama_model_loader: - type q8_0: 142 tensors
248 | llm_load_vocab: special tokens cache size = 771
249 | llm_load_vocab: token to piece cache size = 0.1710 MB
250 | llm_load_print_meta: format = GGUF V3 (latest)
251 | llm_load_print_meta: arch = llama
252 | llm_load_print_meta: vocab type = SPM
253 | llm_load_print_meta: n_vocab = 32768
254 | llm_load_print_meta: n_merges = 0
255 | llm_load_print_meta: vocab_only = 0
256 | llm_load_print_meta: n_ctx_train = 4096
257 | llm_load_print_meta: n_embd = 896
258 | llm_load_print_meta: n_layer = 20
259 | llm_load_print_meta: n_head = 16
260 | llm_load_print_meta: n_head_kv = 8
261 | llm_load_print_meta: n_rot = 56
262 | llm_load_print_meta: n_swa = 0
263 | llm_load_print_meta: n_embd_head_k = 56
264 | llm_load_print_meta: n_embd_head_v = 56
265 | llm_load_print_meta: n_gqa = 2
266 | llm_load_print_meta: n_embd_k_gqa = 448
267 | llm_load_print_meta: n_embd_v_gqa = 448
268 | llm_load_print_meta: f_norm_eps = 0.0e+00
269 | llm_load_print_meta: f_norm_rms_eps = 1.0e-06
270 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
271 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
272 | llm_load_print_meta: f_logit_scale = 0.0e+00
273 | llm_load_print_meta: n_ff = 3584
274 | llm_load_print_meta: n_expert = 0
275 | llm_load_print_meta: n_expert_used = 0
276 | llm_load_print_meta: causal attn = 1
277 | llm_load_print_meta: pooling type = 0
278 | llm_load_print_meta: rope type = 0
279 | llm_load_print_meta: rope scaling = linear
280 | llm_load_print_meta: freq_base_train = 10000.0
281 | llm_load_print_meta: freq_scale_train = 1
282 | llm_load_print_meta: n_ctx_orig_yarn = 4096
283 | llm_load_print_meta: rope_finetuned = unknown
284 | llm_load_print_meta: ssm_d_conv = 0
285 | llm_load_print_meta: ssm_d_inner = 0
286 | llm_load_print_meta: ssm_d_state = 0
287 | llm_load_print_meta: ssm_dt_rank = 0
288 | llm_load_print_meta: model type = ?B
289 | llm_load_print_meta: model ftype = Q8_0
290 | llm_load_print_meta: model params = 299.60 M
291 | llm_load_print_meta: model size = 303.68 MiB (8.50 BPW)
292 | llm_load_print_meta: general.name = Lite-Oute-1-300M-Instruct
293 | llm_load_print_meta: BOS token = 1 '<s>'
294 | llm_load_print_meta: EOS token = 32000 '<|im_end|>'
295 | llm_load_print_meta: UNK token = 0 '<unk>'
296 | llm_load_print_meta: PAD token = 32000 '<|im_end|>'
297 | llm_load_print_meta: LF token = 13 '<0x0A>'
298 | llm_load_print_meta: EOT token = 32000 '<|im_end|>'
299 | llm_load_print_meta: max token length = 48
300 | llm_load_tensors: ggml ctx size = 0.09 MiB
301 | llm_load_tensors: CPU buffer size = 303.68 MiB
302 | ..................................................................................
303 | llama_new_context_with_model: n_ctx = 512
304 | llama_new_context_with_model: n_batch = 512
305 | llama_new_context_with_model: n_ubatch = 512
306 | llama_new_context_with_model: flash_attn = 0
307 | llama_new_context_with_model: freq_base = 10000.0
308 | llama_new_context_with_model: freq_scale = 1
309 | llama_kv_cache_init: CPU KV buffer size = 17.50 MiB
310 | llama_new_context_with_model: KV self size = 17.50 MiB, K (f16): 8.75 MiB, V (f16): 8.75 MiB
311 | llama_new_context_with_model: CPU output buffer size = 0.13 MiB
312 | llama_new_context_with_model: CPU compute buffer size = 65.75 MiB
313 | llama_new_context_with_model: graph nodes = 646
314 | llama_new_context_with_model: graph splits = 1
315 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
316 | Model metadata: {'general.name': 'Lite-Oute-1-300M-Instruct', 'general.architecture': 'llama', 'llama.block_count': '20', 'llama.context_length': '4096', 'tokenizer.ggml.eos_token_id': '32000', 'general.file_type': '7', 'llama.attention.head_count_kv': '8', 'llama.embedding_length': '896', 'llama.feed_forward_length': '3584', 'llama.attention.head_count': '16', 'llama.rope.freq_base': '10000.000000', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.vocab_size': '32768', 'llama.rope.dimension_count': '56', 'tokenizer.ggml.pre': 'default', 'tokenizer.ggml.add_space_prefix': 'true', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '32000', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"}
317 | Available chat formats from metadata: chat_template.default
318 | Guessed chat format: chatml
319 | >>>
320 | /////////////////////////////////////////////////////////////////////////////////////////////////////
321 | TINYLLAMA JSON
322 |
323 | >>> q = Llama(model_path='models/unsloth.Q4_K_M.gguf', verbose=True)
324 | llama_model_loader: loaded meta data with 33 key-value pairs and 201 tensors from models/unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
325 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
326 | llama_model_loader: - kv 0: general.architecture str = llama
327 | llama_model_loader: - kv 1: general.type str = model
328 | llama_model_loader: - kv 2: general.name str = Tinyllama Bnb 4bit
329 | llama_model_loader: - kv 3: general.organization str = Unsloth
330 | llama_model_loader: - kv 4: general.finetune str = 4bit
331 | llama_model_loader: - kv 5: general.basename str = tinyllama-bnb
332 | llama_model_loader: - kv 6: general.size_label str = 1.1B
333 | llama_model_loader: - kv 7: llama.block_count u32 = 22
334 | llama_model_loader: - kv 8: llama.context_length u32 = 4096
335 | llama_model_loader: - kv 9: llama.embedding_length u32 = 2048
336 | llama_model_loader: - kv 10: llama.feed_forward_length u32 = 5632
337 | llama_model_loader: - kv 11: llama.attention.head_count u32 = 32
338 | llama_model_loader: - kv 12: llama.attention.head_count_kv u32 = 4
339 | llama_model_loader: - kv 13: llama.rope.freq_base f32 = 10000.000000
340 | llama_model_loader: - kv 14: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
341 | llama_model_loader: - kv 15: general.file_type u32 = 15
342 | llama_model_loader: - kv 16: llama.vocab_size u32 = 32000
343 | llama_model_loader: - kv 17: llama.rope.dimension_count u32 = 64
344 | llama_model_loader: - kv 18: llama.rope.scaling.type str = linear
345 | llama_model_loader: - kv 19: llama.rope.scaling.factor f32 = 2.000000
346 | llama_model_loader: - kv 20: tokenizer.ggml.add_space_prefix bool = false
347 | llama_model_loader: - kv 21: tokenizer.ggml.model str = llama
348 | llama_model_loader: - kv 22: tokenizer.ggml.pre str = default
349 | llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
350 | llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
351 | llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
352 | llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 1
353 | llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 2
354 | llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 0
355 | llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0
356 | llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true
357 | llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false
358 | llama_model_loader: - kv 32: general.quantization_version u32 = 2
359 | llama_model_loader: - type f32: 45 tensors
360 | llama_model_loader: - type q4_K: 135 tensors
361 | llama_model_loader: - type q6_K: 21 tensors
362 | llm_load_vocab: special tokens cache size = 3
363 | llm_load_vocab: token to piece cache size = 0.1684 MB
364 | llm_load_print_meta: format = GGUF V3 (latest)
365 | llm_load_print_meta: arch = llama
366 | llm_load_print_meta: vocab type = SPM
367 | llm_load_print_meta: n_vocab = 32000
368 | llm_load_print_meta: n_merges = 0
369 | llm_load_print_meta: vocab_only = 0
370 | llm_load_print_meta: n_ctx_train = 4096
371 | llm_load_print_meta: n_embd = 2048
372 | llm_load_print_meta: n_layer = 22
373 | llm_load_print_meta: n_head = 32
374 | llm_load_print_meta: n_head_kv = 4
375 | llm_load_print_meta: n_rot = 64
376 | llm_load_print_meta: n_swa = 0
377 | llm_load_print_meta: n_embd_head_k = 64
378 | llm_load_print_meta: n_embd_head_v = 64
379 | llm_load_print_meta: n_gqa = 8
380 | llm_load_print_meta: n_embd_k_gqa = 256
381 | llm_load_print_meta: n_embd_v_gqa = 256
382 | llm_load_print_meta: f_norm_eps = 0.0e+00
383 | llm_load_print_meta: f_norm_rms_eps = 1.0e-05
384 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
385 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
386 | llm_load_print_meta: f_logit_scale = 0.0e+00
387 | llm_load_print_meta: n_ff = 5632
388 | llm_load_print_meta: n_expert = 0
389 | llm_load_print_meta: n_expert_used = 0
390 | llm_load_print_meta: causal attn = 1
391 | llm_load_print_meta: pooling type = 0
392 | llm_load_print_meta: rope type = 0
393 | llm_load_print_meta: rope scaling = linear
394 | llm_load_print_meta: freq_base_train = 10000.0
395 | llm_load_print_meta: freq_scale_train = 0.5
396 | llm_load_print_meta: n_ctx_orig_yarn = 4096
397 | llm_load_print_meta: rope_finetuned = unknown
398 | llm_load_print_meta: ssm_d_conv = 0
399 | llm_load_print_meta: ssm_d_inner = 0
400 | llm_load_print_meta: ssm_d_state = 0
401 | llm_load_print_meta: ssm_dt_rank = 0
402 | llm_load_print_meta: model type = 1B
403 | llm_load_print_meta: model ftype = Q4_K - Medium
404 | llm_load_print_meta: model params = 1.10 B
405 | llm_load_print_meta: model size = 636.18 MiB (4.85 BPW)
406 | llm_load_print_meta: general.name = Tinyllama Bnb 4bit
407 | llm_load_print_meta: BOS token = 1 '<s>'
408 | llm_load_print_meta: EOS token = 2 '</s>'
409 | llm_load_print_meta: UNK token = 0 '<unk>'
410 | llm_load_print_meta: PAD token = 0 '<unk>'
411 | llm_load_print_meta: LF token = 13 '<0x0A>'
412 | llm_load_print_meta: max token length = 48
413 | llm_load_tensors: ggml ctx size = 0.09 MiB
414 | llm_load_tensors: CPU buffer size = 636.18 MiB
415 | ....................................................................................
416 | llama_new_context_with_model: n_ctx = 512
417 | llama_new_context_with_model: n_batch = 512
418 | llama_new_context_with_model: n_ubatch = 512
419 | llama_new_context_with_model: flash_attn = 0
420 | llama_new_context_with_model: freq_base = 10000.0
421 | llama_new_context_with_model: freq_scale = 0.5
422 | llama_kv_cache_init: CPU KV buffer size = 11.00 MiB
423 | llama_new_context_with_model: KV self size = 11.00 MiB, K (f16): 5.50 MiB, V (f16): 5.50 MiB
424 | llama_new_context_with_model: CPU output buffer size = 0.12 MiB
425 | llama_new_context_with_model: CPU compute buffer size = 66.50 MiB
426 | llama_new_context_with_model: graph nodes = 710
427 | llama_new_context_with_model: graph splits = 1
428 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
429 | Model metadata: {'general.name': 'Tinyllama Bnb 4bit', 'general.architecture': 'llama', 'general.type': 'model', 'llama.context_length': '4096', 'general.organization': 'Unsloth', 'llama.block_count': '22', 'general.basename': 'tinyllama-bnb', 'general.finetune': '4bit', 'general.size_label': '1.1B', 'llama.embedding_length': '2048', 'llama.feed_forward_length': '5632', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '15', 'llama.attention.head_count_kv': '4', 'llama.rope.freq_base': '10000.000000', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.vocab_size': '32000', 'llama.rope.dimension_count': '64', 'llama.rope.scaling.type': 'linear', 'llama.rope.scaling.factor': '2.000000', 'tokenizer.ggml.pre': 'default', 'tokenizer.ggml.add_space_prefix': 'false', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false'}
430 | Using fallback chat format: llama-2
431 |
432 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////
433 |
434 | TINYLLAMA 2B CTX 2048 no CHAT
435 | >>> q = Llama(model_path='models/Tinyllama-2B-Q8_0.gguf', verbose=True)
436 | llama_model_loader: loaded meta data with 26 key-value pairs and 399 tensors from models/Tinyllama-2B-Q8_0.gguf (version GGUF V3 (latest))
437 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
438 | llama_model_loader: - kv 0: general.architecture str = llama
439 | llama_model_loader: - kv 1: general.name str = model
440 | llama_model_loader: - kv 2: llama.block_count u32 = 44
441 | llama_model_loader: - kv 3: llama.context_length u32 = 2048
442 | llama_model_loader: - kv 4: llama.embedding_length u32 = 2048
443 | llama_model_loader: - kv 5: llama.feed_forward_length u32 = 5632
444 | llama_model_loader: - kv 6: llama.attention.head_count u32 = 32
445 | llama_model_loader: - kv 7: llama.attention.head_count_kv u32 = 4
446 | llama_model_loader: - kv 8: llama.rope.freq_base f32 = 10000.000000
447 | llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
448 | llama_model_loader: - kv 10: general.file_type u32 = 7
449 | llama_model_loader: - kv 11: llama.vocab_size u32 = 32000
450 | llama_model_loader: - kv 12: llama.rope.dimension_count u32 = 64
451 | llama_model_loader: - kv 13: tokenizer.ggml.add_space_prefix bool = false
452 | llama_model_loader: - kv 14: tokenizer.ggml.model str = llama
453 | llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
454 | llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
455 | llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
456 | llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
457 | llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1
458 | llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 2
459 | llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0
460 | llama_model_loader: - kv 22: tokenizer.ggml.padding_token_id u32 = 0
461 | llama_model_loader: - kv 23: tokenizer.ggml.add_bos_token bool = true
462 | llama_model_loader: - kv 24: tokenizer.ggml.add_eos_token bool = false
463 | llama_model_loader: - kv 25: general.quantization_version u32 = 2
464 | llama_model_loader: - type f32: 89 tensors
465 | llama_model_loader: - type q8_0: 310 tensors
466 | llm_load_vocab: special tokens cache size = 3
467 | llm_load_vocab: token to piece cache size = 0.1684 MB
468 | llm_load_print_meta: format = GGUF V3 (latest)
469 | llm_load_print_meta: arch = llama
470 | llm_load_print_meta: vocab type = SPM
471 | llm_load_print_meta: n_vocab = 32000
472 | llm_load_print_meta: n_merges = 0
473 | llm_load_print_meta: vocab_only = 0
474 | llm_load_print_meta: n_ctx_train = 2048
475 | llm_load_print_meta: n_embd = 2048
476 | llm_load_print_meta: n_layer = 44
477 | llm_load_print_meta: n_head = 32
478 | llm_load_print_meta: n_head_kv = 4
479 | llm_load_print_meta: n_rot = 64
480 | llm_load_print_meta: n_swa = 0
481 | llm_load_print_meta: n_embd_head_k = 64
482 | llm_load_print_meta: n_embd_head_v = 64
483 | llm_load_print_meta: n_gqa = 8
484 | llm_load_print_meta: n_embd_k_gqa = 256
485 | llm_load_print_meta: n_embd_v_gqa = 256
486 | llm_load_print_meta: f_norm_eps = 0.0e+00
487 | llm_load_print_meta: f_norm_rms_eps = 1.0e-05
488 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
489 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
490 | llm_load_print_meta: f_logit_scale = 0.0e+00
491 | llm_load_print_meta: n_ff = 5632
492 | llm_load_print_meta: n_expert = 0
493 | llm_load_print_meta: n_expert_used = 0
494 | llm_load_print_meta: causal attn = 1
495 | llm_load_print_meta: pooling type = 0
496 | llm_load_print_meta: rope type = 0
497 | llm_load_print_meta: rope scaling = linear
498 | llm_load_print_meta: freq_base_train = 10000.0
499 | llm_load_print_meta: freq_scale_train = 1
500 | llm_load_print_meta: n_ctx_orig_yarn = 2048
501 | llm_load_print_meta: rope_finetuned = unknown
502 | llm_load_print_meta: ssm_d_conv = 0
503 | llm_load_print_meta: ssm_d_inner = 0
504 | llm_load_print_meta: ssm_d_state = 0
505 | llm_load_print_meta: ssm_dt_rank = 0
506 | llm_load_print_meta: model type = ?B
507 | llm_load_print_meta: model ftype = Q8_0
508 | llm_load_print_meta: model params = 2.07 B
509 | llm_load_print_meta: model size = 2.05 GiB (8.50 BPW)
510 | llm_load_print_meta: general.name = model
511 | llm_load_print_meta: BOS token = 1 '<s>'
512 | llm_load_print_meta: EOS token = 2 '</s>'
513 | llm_load_print_meta: UNK token = 0 '<unk>'
514 | llm_load_print_meta: PAD token = 0 '<unk>'
515 | llm_load_print_meta: LF token = 13 '<0x0A>'
516 | llm_load_print_meta: max token length = 48
517 | llm_load_tensors: ggml ctx size = 0.19 MiB
518 | llm_load_tensors: CPU buffer size = 2097.01 MiB
519 | ................................................................................................
520 | llama_new_context_with_model: n_ctx = 512
521 | llama_new_context_with_model: n_batch = 512
522 | llama_new_context_with_model: n_ubatch = 512
523 | llama_new_context_with_model: flash_attn = 0
524 | llama_new_context_with_model: freq_base = 10000.0
525 | llama_new_context_with_model: freq_scale = 1
526 | llama_kv_cache_init: CPU KV buffer size = 22.00 MiB
527 | llama_new_context_with_model: KV self size = 22.00 MiB, K (f16): 11.00 MiB, V (f16): 11.00 MiB
528 | llama_new_context_with_model: CPU output buffer size = 0.12 MiB
529 | llama_new_context_with_model: CPU compute buffer size = 66.50 MiB
530 | llama_new_context_with_model: graph nodes = 1414
531 | llama_new_context_with_model: graph splits = 1
532 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
533 | Model metadata: {'general.name': 'model', 'general.architecture': 'llama', 'llama.block_count': '44', 'llama.context_length': '2048', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '7', 'llama.attention.head_count_kv': '4', 'llama.embedding_length': '2048', 'llama.feed_forward_length': '5632', 'llama.attention.head_count': '32', 'llama.rope.freq_base': '10000.000000', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.vocab_size': '32000', 'llama.rope.dimension_count': '64', 'tokenizer.ggml.pre': 'default', 'tokenizer.ggml.add_space_prefix': 'false', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false'}
534 | Using fallback chat format: llama-2
535 |
536 |
537 | ////////////////////////////////////////////////////////////////////////////////////////////////////
538 | >>> q = Llama(model_path='models/Ci-0_5B-Chat.Q8_0.gguf', verbose=True)
539 | llama_model_loader: loaded meta data with 34 key-value pairs and 290 tensors from models/Ci-0_5B-Chat.Q8_0.gguf (version GGUF V3 (latest))
540 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
541 | llama_model_loader: - kv 0: general.architecture str = qwen2
542 | llama_model_loader: - kv 1: general.type str = model
543 | llama_model_loader: - kv 2: general.name str = Ci 0_5B Chat
544 | llama_model_loader: - kv 3: general.organization str = LLMCi
545 | llama_model_loader: - kv 4: general.finetune str = Chat
546 | llama_model_loader: - kv 5: general.basename str = Ci
547 | llama_model_loader: - kv 6: general.size_label str = 0.5B
548 | llama_model_loader: - kv 7: general.tags arr[str,3] = ["cible", "trl", "sft"]
549 | llama_model_loader: - kv 8: qwen2.block_count u32 = 24
550 | llama_model_loader: - kv 9: qwen2.context_length u32 = 32768
551 | llama_model_loader: - kv 10: qwen2.embedding_length u32 = 1024
552 | llama_model_loader: - kv 11: qwen2.feed_forward_length u32 = 2816
553 | llama_model_loader: - kv 12: qwen2.attention.head_count u32 = 16
554 | llama_model_loader: - kv 13: qwen2.attention.head_count_kv u32 = 16
555 | llama_model_loader: - kv 14: qwen2.rope.freq_base f32 = 1000000.000000
556 | llama_model_loader: - kv 15: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
557 | llama_model_loader: - kv 16: general.file_type u32 = 7
558 | llama_model_loader: - kv 17: tokenizer.ggml.model str = gpt2
559 | llama_model_loader: - kv 18: tokenizer.ggml.pre str = qwen2
560 | llama_model_loader: - kv 19: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
561 | llama_model_loader: - kv 20: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
562 | llama_model_loader: - kv 21: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
563 | llama_model_loader: - kv 22: tokenizer.ggml.eos_token_id u32 = 151645
564 | llama_model_loader: - kv 23: tokenizer.ggml.padding_token_id u32 = 151643
565 | llama_model_loader: - kv 24: tokenizer.ggml.bos_token_id u32 = 151643
566 | llama_model_loader: - kv 25: tokenizer.chat_template str = {% for message in messages %}{% if lo...
567 | llama_model_loader: - kv 26: general.quantization_version u32 = 2
568 | llama_model_loader: - kv 27: general.url str = https://huggingface.co/mradermacher/C...
569 | llama_model_loader: - kv 28: mradermacher.quantize_version str = 2
570 | llama_model_loader: - kv 29: mradermacher.quantized_by str = mradermacher
571 | llama_model_loader: - kv 30: mradermacher.quantized_at str = 2024-07-29T18:49:33+02:00
572 | llama_model_loader: - kv 31: mradermacher.quantized_on str = leia
573 | llama_model_loader: - kv 32: general.source.url str = https://huggingface.co/LLMCi/Ci-0_5B-...
574 | llama_model_loader: - kv 33: mradermacher.convert_type str = hf
575 | llama_model_loader: - type f32: 121 tensors
576 | llama_model_loader: - type q8_0: 169 tensors
577 | llm_load_vocab: special tokens cache size = 3
578 | llm_load_vocab: token to piece cache size = 0.9308 MB
579 | llm_load_print_meta: format = GGUF V3 (latest)
580 | llm_load_print_meta: arch = qwen2
581 | llm_load_print_meta: vocab type = BPE
582 | llm_load_print_meta: n_vocab = 151936
583 | llm_load_print_meta: n_merges = 151387
584 | llm_load_print_meta: vocab_only = 0
585 | llm_load_print_meta: n_ctx_train = 32768
586 | llm_load_print_meta: n_embd = 1024
587 | llm_load_print_meta: n_layer = 24
588 | llm_load_print_meta: n_head = 16
589 | llm_load_print_meta: n_head_kv = 16
590 | llm_load_print_meta: n_rot = 64
591 | llm_load_print_meta: n_swa = 0
592 | llm_load_print_meta: n_embd_head_k = 64
593 | llm_load_print_meta: n_embd_head_v = 64
594 | llm_load_print_meta: n_gqa = 1
595 | llm_load_print_meta: n_embd_k_gqa = 1024
596 | llm_load_print_meta: n_embd_v_gqa = 1024
597 | llm_load_print_meta: f_norm_eps = 0.0e+00
598 | llm_load_print_meta: f_norm_rms_eps = 1.0e-06
599 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
600 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
601 | llm_load_print_meta: f_logit_scale = 0.0e+00
602 | llm_load_print_meta: n_ff = 2816
603 | llm_load_print_meta: n_expert = 0
604 | llm_load_print_meta: n_expert_used = 0
605 | llm_load_print_meta: causal attn = 1
606 | llm_load_print_meta: pooling type = 0
607 | llm_load_print_meta: rope type = 2
608 | llm_load_print_meta: rope scaling = linear
609 | llm_load_print_meta: freq_base_train = 1000000.0
610 | llm_load_print_meta: freq_scale_train = 1
611 | llm_load_print_meta: n_ctx_orig_yarn = 32768
612 | llm_load_print_meta: rope_finetuned = unknown
613 | llm_load_print_meta: ssm_d_conv = 0
614 | llm_load_print_meta: ssm_d_inner = 0
615 | llm_load_print_meta: ssm_d_state = 0
616 | llm_load_print_meta: ssm_dt_rank = 0
617 | llm_load_print_meta: model type = 0.5B
618 | llm_load_print_meta: model ftype = Q8_0
619 | llm_load_print_meta: model params = 463.99 M
620 | llm_load_print_meta: model size = 470.50 MiB (8.51 BPW)
621 | llm_load_print_meta: general.name = Ci 0_5B Chat
622 | llm_load_print_meta: BOS token = 151643 '<|endoftext|>'
623 | llm_load_print_meta: EOS token = 151645 '<|im_end|>'
624 | llm_load_print_meta: PAD token = 151643 '<|endoftext|>'
625 | llm_load_print_meta: LF token = 148848 'ÄĬ'
626 | llm_load_print_meta: EOT token = 151645 '<|im_end|>'
627 | llm_load_print_meta: max token length = 256
628 | llm_load_tensors: ggml ctx size = 0.13 MiB
629 | llm_load_tensors: CPU buffer size = 470.50 MiB
630 | ....................................................
631 | llama_new_context_with_model: n_ctx = 512
632 | llama_new_context_with_model: n_batch = 512
633 | llama_new_context_with_model: n_ubatch = 512
634 | llama_new_context_with_model: flash_attn = 0
635 | llama_new_context_with_model: freq_base = 1000000.0
636 | llama_new_context_with_model: freq_scale = 1
637 | llama_kv_cache_init: CPU KV buffer size = 48.00 MiB
638 | llama_new_context_with_model: KV self size = 48.00 MiB, K (f16): 24.00 MiB, V (f16): 24.00 MiB
639 | llama_new_context_with_model: CPU output buffer size = 0.58 MiB
640 | llama_new_context_with_model: CPU compute buffer size = 298.75 MiB
641 | llama_new_context_with_model: graph nodes = 846
642 | llama_new_context_with_model: graph splits = 1
643 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
644 | Model metadata: {'mradermacher.convert_type': 'hf', 'general.name': 'Ci 0_5B Chat', 'general.architecture': 'qwen2', 'general.type': 'model', 'general.organization': 'LLMCi', 'general.basename': 'Ci', 'general.finetune': 'Chat', 'qwen2.block_count': '24', 'mradermacher.quantized_on': 'leia', 'general.size_label': '0.5B', 'qwen2.context_length': '32768', 'general.url': 'https://huggingface.co/mradermacher/Ci-0_5B-Chat-GGUF', 'qwen2.embedding_length': '1024', 'general.source.url': 'https://huggingface.co/LLMCi/Ci-0_5B-Chat', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '151643', 'qwen2.feed_forward_length': '2816', 'qwen2.attention.head_count': '16', 'qwen2.attention.head_count_kv': '16', 'tokenizer.ggml.padding_token_id': '151643', 'qwen2.rope.freq_base': '1000000.000000', 'qwen2.attention.layer_norm_rms_epsilon': '0.000001', 'tokenizer.ggml.eos_token_id': '151645', 'general.file_type': '7', 'tokenizer.ggml.model': 'gpt2', 'tokenizer.ggml.pre': 'qwen2', 'tokenizer.chat_template': "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", 'mradermacher.quantize_version': '2', 'mradermacher.quantized_by': 'mradermacher', 'mradermacher.quantized_at': '2024-07-29T18:49:33+02:00'}
645 | Available chat formats from metadata: chat_template.default
646 | Using gguf chat template: {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
647 | You are a helpful assistant.<|im_end|>
648 | ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
649 | ' + message['content'] + '<|im_end|>' + '
650 | '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
651 | ' }}{% endif %}
652 | Using chat eos_token: <|im_end|>
653 | Using chat bos_token: <|endoftext|>
654 | >>>
655 |
656 |
657 | //////////////////////////////////////////////////////////////////////////////
658 | >>> q = Llama(model_path='models/openelm-270m-instruct-q8_0.gguf', verbose=True)
659 | llama_model_loader: loaded meta data with 25 key-value pairs and 146 tensors from models/openelm-270m-instruct-q8_0.gguf (version GGUF V3 (latest))
660 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
661 | llama_model_loader: - kv 0: general.architecture str = openelm
662 | llama_model_loader: - kv 1: general.name str = OpenELM-270M-Instruct
663 | llama_model_loader: - kv 2: openelm.block_count u32 = 16
664 | llama_model_loader: - kv 3: openelm.context_length u32 = 2048
665 | llama_model_loader: - kv 4: openelm.embedding_length u32 = 1280
666 | llama_model_loader: - kv 5: openelm.feed_forward_length arr[i32,16] = [768, 1024, 1280, 1536, 1792, 2048, 2...
667 | llama_model_loader: - kv 6: openelm.attention.head_count arr[i32,16] = [12, 12, 12, 12, 12, 16, 16, 16, 16, ...
668 | llama_model_loader: - kv 7: openelm.attention.head_count_kv arr[i32,16] = [3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, ...
669 | llama_model_loader: - kv 8: openelm.rope.freq_base f32 = 10000.000000
670 | llama_model_loader: - kv 9: openelm.attention.layer_norm_rms_epsilon f32 = 0.000001
671 | llama_model_loader: - kv 10: openelm.rope.dimension_count u32 = 64
672 | llama_model_loader: - kv 11: openelm.attention.key_length u32 = 64
673 | llama_model_loader: - kv 12: openelm.attention.value_length u32 = 64
674 | llama_model_loader: - kv 13: general.file_type u32 = 7
675 | llama_model_loader: - kv 14: tokenizer.ggml.model str = llama
676 | llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
677 | llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
678 | llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
679 | llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
680 | llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1
681 | llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 2
682 | llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0
683 | llama_model_loader: - kv 22: tokenizer.ggml.add_bos_token bool = true
684 | llama_model_loader: - kv 23: tokenizer.ggml.add_eos_token bool = false
685 | llama_model_loader: - kv 24: general.quantization_version u32 = 2
686 | llama_model_loader: - type f32: 65 tensors
687 | llama_model_loader: - type q8_0: 81 tensors
688 | llm_load_vocab: special tokens cache size = 3
689 | llm_load_vocab: token to piece cache size = 0.1684 MB
690 | llm_load_print_meta: format = GGUF V3 (latest)
691 | llm_load_print_meta: arch = openelm
692 | llm_load_print_meta: vocab type = SPM
693 | llm_load_print_meta: n_vocab = 32000
694 | llm_load_print_meta: n_merges = 0
695 | llm_load_print_meta: vocab_only = 0
696 | llm_load_print_meta: n_ctx_train = 2048
697 | llm_load_print_meta: n_embd = 1280
698 | llm_load_print_meta: n_layer = 16
699 | llm_load_print_meta: n_head = [12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20]
700 | llm_load_print_meta: n_head_kv = [3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5]
701 | llm_load_print_meta: n_rot = 64
702 | llm_load_print_meta: n_swa = 0
703 | llm_load_print_meta: n_embd_head_k = 64
704 | llm_load_print_meta: n_embd_head_v = 64
705 | llm_load_print_meta: n_gqa = 4
706 | llm_load_print_meta: n_embd_k_gqa = [192, 192, 192, 192, 192, 256, 256, 256, 256, 256, 256, 256, 320, 320, 320, 320]
707 | llm_load_print_meta: n_embd_v_gqa = [192, 192, 192, 192, 192, 256, 256, 256, 256, 256, 256, 256, 320, 320, 320, 320]
708 | llm_load_print_meta: f_norm_eps = 0.0e+00
709 | llm_load_print_meta: f_norm_rms_eps = 1.0e-06
710 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
711 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
712 | llm_load_print_meta: f_logit_scale = 0.0e+00
713 | llm_load_print_meta: n_ff = [768, 1024, 1280, 1536, 1792, 2048, 2560, 2816, 3072, 3328, 3584, 3840, 4352, 4608, 4864, 5120]
714 | llm_load_print_meta: n_expert = 0
715 | llm_load_print_meta: n_expert_used = 0
716 | llm_load_print_meta: causal attn = 1
717 | llm_load_print_meta: pooling type = 0
718 | llm_load_print_meta: rope type = 2
719 | llm_load_print_meta: rope scaling = linear
720 | llm_load_print_meta: freq_base_train = 10000.0
721 | llm_load_print_meta: freq_scale_train = 1
722 | llm_load_print_meta: n_ctx_orig_yarn = 2048
723 | llm_load_print_meta: rope_finetuned = unknown
724 | llm_load_print_meta: ssm_d_conv = 0
725 | llm_load_print_meta: ssm_d_inner = 0
726 | llm_load_print_meta: ssm_d_state = 0
727 | llm_load_print_meta: ssm_dt_rank = 0
728 | llm_load_print_meta: model type = 270M
729 | llm_load_print_meta: model ftype = Q8_0
730 | llm_load_print_meta: model params = 271.53 M
731 | llm_load_print_meta: model size = 275.26 MiB (8.50 BPW)
732 | llm_load_print_meta: general.name = OpenELM-270M-Instruct
733 | llm_load_print_meta: BOS token = 1 '<s>'
734 | llm_load_print_meta: EOS token = 2 '</s>'
735 | llm_load_print_meta: UNK token = 0 '<unk>'
736 | llm_load_print_meta: LF token = 13 '<0x0A>'
737 | llm_load_print_meta: max token length = 48
738 | llm_load_tensors: ggml ctx size = 0.07 MiB
739 | llm_load_tensors: CPU buffer size = 275.26 MiB
740 | .........................................................
741 | llama_new_context_with_model: n_ctx = 512
742 | llama_new_context_with_model: n_batch = 512
743 | llama_new_context_with_model: n_ubatch = 512
744 | llama_new_context_with_model: flash_attn = 0
745 | llama_new_context_with_model: freq_base = 10000.0
746 | llama_new_context_with_model: freq_scale = 1
747 | llama_kv_cache_init: CPU KV buffer size = 7.88 MiB
748 | llama_new_context_with_model: KV self size = 7.88 MiB, K (f16): 3.94 MiB, V (f16): 3.94 MiB
749 | llama_new_context_with_model: CPU output buffer size = 0.12 MiB
750 | llama_new_context_with_model: CPU compute buffer size = 68.51 MiB
751 | llama_new_context_with_model: graph nodes = 646
752 | llama_new_context_with_model: graph splits = 1
753 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
754 | Model metadata: {'general.name': 'OpenELM-270M-Instruct', 'general.architecture': 'openelm', 'openelm.block_count': '16', 'tokenizer.ggml.add_bos_token': 'true', 'openelm.rope.freq_base': '10000.000000', 'openelm.attention.layer_norm_rms_epsilon': '0.000001', 'openelm.context_length': '2048', 'openelm.attention.value_length': '64', 'openelm.embedding_length': '1280', 'openelm.rope.dimension_count': '64', 'openelm.attention.key_length': '64', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '7', 'tokenizer.ggml.model': 'llama', 'tokenizer.ggml.pre': 'default', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.add_eos_token': 'false'}
755 | Using fallback chat format: llama-2
756 |
757 |
758 |
759 |
760 |
761 |
762 |
--------------------------------------------------------------------------------