├── img ├── yourassetsHere.md ├── bot.jpg ├── gemma.png ├── ibm.jpg ├── qwen.png ├── user.jpg ├── granite3.png ├── mockup001.png ├── thumbnail.png ├── final-mockup001.png ├── gemma2-2b-myGGUF.png └── videoframe_11799.png ├── requirements.txt ├── testGranite.py ├── layout.py ├── layout2OpenAI.py ├── layout3Ollama-Granite.py ├── promptLibv2.py ├── promptLibv2Qwen.py ├── usefulResources.md ├── layout4Ollama-Granite_autotest.py ├── 02.GR-Ollama-Granite_autotest.py ├── 02.GR-Ollama-Gemma2B_autotest.py ├── 02.GR-Llamafile-Gemma2B_autotest.py ├── README.md └── gradio-rbyf_chat.py /img/yourassetsHere.md: -------------------------------------------------------------------------------- 1 | here the images 2 | -------------------------------------------------------------------------------- /img/bot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/bot.jpg -------------------------------------------------------------------------------- /img/gemma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/gemma.png -------------------------------------------------------------------------------- /img/ibm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/ibm.jpg -------------------------------------------------------------------------------- /img/qwen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/qwen.png -------------------------------------------------------------------------------- /img/user.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/user.jpg -------------------------------------------------------------------------------- /img/granite3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/granite3.png -------------------------------------------------------------------------------- /img/mockup001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/mockup001.png -------------------------------------------------------------------------------- /img/thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/thumbnail.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/requirements.txt -------------------------------------------------------------------------------- /img/final-mockup001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/final-mockup001.png -------------------------------------------------------------------------------- /img/gemma2-2b-myGGUF.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/gemma2-2b-myGGUF.png -------------------------------------------------------------------------------- /img/videoframe_11799.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabiomatricardi/GradioRBYF/main/img/videoframe_11799.png -------------------------------------------------------------------------------- /testGranite.py: -------------------------------------------------------------------------------- 1 | # Chat with an intelligent assistant in your terminal 2 | # MODEL: ollama-granite3dense 3 | # this wil run granite3-2B-instruct through ollamaAPI 4 | """ 5 | > ollama show granite3-dense 6 | Model 7 | architecture granite 8 | parameters 2.6B 9 | context length 4096 10 | embedding length 2048 11 | quantization Q4_K_M 12 | 13 | License 14 | Apache License 15 | Version 2.0, January 2004 16 | """ 17 | import sys 18 | from time import sleep 19 | import warnings 20 | warnings.filterwarnings(action='ignore') 21 | import datetime 22 | from promptLibv2 import countTokens, writehistory, createCatalog 23 | from promptLibv2 import genRANstring, createStats 24 | import argparse 25 | from openai import OpenAI 26 | 27 | #Add GPU argument in the parser 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("-g", "--gpu", action="store_true") 30 | 31 | args = parser.parse_args() 32 | GPU = args.gpu 33 | if GPU: 34 | ngpu_layers = 2 35 | print(f'Selected GPU: offloading {ngpu_layers} layers...') 36 | else: 37 | ngpu_layers = 0 #out of 28 38 | print('Loading Model on CPU only......') 39 | 40 | stops = ['<|end_of_text|>'] 41 | tasks = createCatalog() 42 | modelname = 'granite3-dense:2b' 43 | # create THE LOG FILE 44 | logfile = f'logs/{modelname}_CHAT_OLLAMA_{genRANstring(5)}_log.txt' 45 | logfilename = logfile 46 | #Write in the history the first 2 sessions 47 | writehistory(logfilename,f'{str(datetime.datetime.now())}\n\nYour own LocalGPT with 💻 {modelname}\n---\n🧠🫡: You are a helpful assistant.') 48 | writehistory(logfilename,f'💻: How can I assist you today in writing?') 49 | 50 | print("\033[95;3;6m") 51 | print("1. Waiting 10 seconds for the API to load...") 52 | # using OpenAI library to connect to Ollama API endpoint 53 | client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama') 54 | print(f"2. Model {modelname} loaded with OLLAMA...") 55 | print("\033[0m") #reset all 56 | history = [] 57 | print("\033[92;1m") 58 | print(f'📝Logfile: {logfilename}') 59 | 60 | ##################### ALIGNMENT FIRST GENERATION ############################################## 61 | question = 'Explain the plot of Cinderella in a sentence.' 
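# NOTE: the OpenAI-compatible chat endpoint expects `messages` as a list of
# {"role": ..., "content": ...} dicts. The list below carries a single user
# turn; a system message could be prepended to steer the model, for example
# (hypothetical variant, not used in the actual test run):
#
#   test = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": question},
#   ]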
62 | test = [ 63 | {"role": "user", "content": question} 64 | ] 65 | 66 | print('Question:', question) 67 | start = datetime.datetime.now() 68 | print("💻 > ", end="", flush=True) 69 | full_response = "" 70 | completion = client.chat.completions.create( 71 | messages=test, 72 | model='granite3-dense', 73 | temperature=0.25, 74 | frequency_penalty = 1.178, 75 | stop=stops, 76 | max_tokens=1500, 77 | stream=True 78 | ) 79 | for chunk in completion: 80 | try: 81 | if chunk.choices[0].delta.content: 82 | print(chunk.choices[0].delta.content, end="", flush=True) 83 | full_response += chunk.choices[0].delta.content 84 | except: 85 | pass 86 | delta = datetime.datetime.now() - start 87 | output = full_response 88 | print('') 89 | print("\033[91;1m") 90 | rating = 'PUT IT LATER'#input('Rate from 0 (BAD) to 5 (VERY GOOD) the quality of generation> ') 91 | print("\033[92;1m") 92 | stats = createStats(delta,question,output,rating,logfilename,'Alignment Generation') 93 | print(stats) 94 | writehistory(logfilename,f'''👨‍💻 . {question} 95 | 💻 > {output} 96 | {stats} 97 | ''') -------------------------------------------------------------------------------- /layout.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import datetime 3 | from promptLibv2Qwen import countTokens, writehistory, createCatalog 4 | from promptLibv2Qwen import genRANstring, createStats 5 | from gradio import ChatMessage 6 | 7 | 8 | ## PREPARING FINAL DATASET 9 | 10 | pd_id = [] 11 | pd_task = [] 12 | pd_vote = [] 13 | pd_remarks = [] 14 | test_progress = 0 15 | history = [] 16 | tasks = createCatalog() 17 | # fizing issue on dipsplaying avatars 18 | # https://www.gradio.app/guides/custom-CSS-and-JS 19 | # https://github.com/gradio-app/gradio/issues/9702 20 | custom_css = """ 21 | 22 | .message-row img { 23 | margin: 0px !important; 24 | } 25 | 26 | .avatar-container img { 27 | padding: 0px !important; 28 | } 29 | """ 30 | 31 | def generate_response(history): 32 | history.append( 33 | ChatMessage(role="user", 34 | content="Hi, my name is Fabio, a Medium writer. Who are you?") 35 | ) 36 | history.append( 37 | ChatMessage(role="assistant", 38 | content="Hi, I am your local GPT. How can I help you?") 39 | ) 40 | return history 41 | 42 | history = generate_response(history) 43 | with gr.Blocks(theme=gr.themes.Glass(), css=custom_css) as demo: 44 | #TITLE SECTION 45 | with gr.Row(variant='compact'): 46 | with gr.Column(scale=1): 47 | gr.Image(value='img/qwen.png', 48 | show_label = False, 49 | show_download_button = False, container = False) 50 | with gr.Column(scale=4): 51 | gr.HTML("
" 52 | + "

<h1 style='text-align: center;'>Revised Benchmark with You as a Feedback!</h1>"
53 |                 + "<h2 style='text-align: center;'>💎 Qwen2.5-0.5B-it - 8K context window</h2>")
54 |         gr.Markdown("""*Run a prompt catalogue with 11 tasks*
55 | to validate the performance of a Small Language Model<br>
56 | At the end of every generation, the process waits for feedback from the user<br>
57 | ### Fixed tuning Parameters: 58 | ``` 59 | temperature = 0.25 60 | repeat_penalty = 1.178 61 | max_new_tokens = 900 62 | 63 | ``` 64 | """) 65 | with gr.Row(variant='compact'): # Progress status 66 | with gr.Column(scale=1): 67 | btn_test = gr.Button(value='Start AutoTest', variant='huggingface') 68 | act_task = gr.Text('', placeholder="running task..",show_label=False) 69 | with gr.Column(scale=4): 70 | actual_progress = gr.Slider(0, len(tasks), 71 | value=test_progress, label="Prompt Catalogue Progress", 72 | #info="Run the most used NLP tasks with a Language Model", 73 | interactive=False) 74 | with gr.Row(variant='compact'): # ChatBot Area 75 | gr.Chatbot(history,type='messages',avatar_images=("./img/user.jpg","./img/bot.jpg")) # 76 | with gr.Row(variant='compact'): # Feedback from the user 77 | with gr.Column(scale=1): 78 | gr.Markdown("""#### Respect this format: 79 | 80 | Put a number from 0 to 5, a space, and then your comments
81 | ``` 82 | 5 very good one 83 | ``` 84 | """) 85 | 86 | with gr.Column(scale=4): 87 | txt_fbck = gr.Text('', placeholder="Your evaluation feedback..", 88 | label='User Feedback',lines=2) 89 | btn_fbck = gr.Button(value='submit feedback', variant='huggingface') 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | demo.launch(inbrowser=True) -------------------------------------------------------------------------------- /layout2OpenAI.py: -------------------------------------------------------------------------------- 1 | # Chat with an intelligent assistant in your terminal 2 | # MODEL: ollama-granite3dense 3 | # this wil run granite3-2B-instruct through ollamaAPI 4 | # sources: https://github.com/fabiomatricardi/-LLM-Studies/raw/main/00.consoleAPI_stream.py 5 | # https://github.com/fabiomatricardi/-LLM-Studies/blob/main/01.st-API-openAI_stream.py 6 | # OLLAMA MODEL CARD: https://ollama.com/library/granite3-dense/blobs/604785e698e9 7 | # OPenAI API for Ollama: https://github.com/ollama/ollama/blob/main/docs/openai.md 8 | # https://github.com/ibm-granite/granite-3.0-language-models 9 | # https://www.ibm.com/granite/docs/ 10 | # HUGGINFACE: https://huggingface.co/ibm-granite/granite-3.0-2b-instruct 11 | ##################################################################################################### 12 | 13 | """ 14 | > ollama show granite3-dense 15 | Model 16 | architecture granite 17 | parameters 2.6B 18 | context length 4096 19 | embedding length 2048 20 | quantization Q4_K_M 21 | 22 | License 23 | Apache License 24 | Version 2.0, January 2004 25 | """ 26 | import gradio as gr 27 | import datetime 28 | from promptLibv2Qwen import countTokens, writehistory, createCatalog 29 | from promptLibv2Qwen import genRANstring, createStats 30 | from gradio import ChatMessage 31 | from openai import OpenAI 32 | 33 | ## PREPARING FINAL DATASET 34 | 35 | pd_id = [] 36 | pd_task = [] 37 | pd_vote = [] 38 | pd_remarks = [] 39 | test_progress = 0 40 | history = [] 41 | tasks = createCatalog() 42 | modelname = 'granite3-dense-2b' 43 | stops = ['<|end_of_text|>'] 44 | #load client with OpenAI API toward Ollama Endpoint 45 | client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama') 46 | print(f"2. Model {modelname} loaded with OLLAMA...") 47 | # fizing issue on dipsplaying avatars 48 | # https://www.gradio.app/guides/custom-CSS-and-JS 49 | # https://github.com/gradio-app/gradio/issues/9702 50 | custom_css = """ 51 | 52 | .message-row img { 53 | margin: 0px !important; 54 | } 55 | 56 | .avatar-container img { 57 | padding: 0px !important; 58 | } 59 | """ 60 | 61 | def generate_response(history): 62 | history.append( 63 | ChatMessage(role="user", 64 | content="Hi, my name is Fabio, a Medium writer. Who are you?") 65 | ) 66 | history.append( 67 | ChatMessage(role="assistant", 68 | content="Hi, I am your local GPT. How can I help you?") 69 | ) 70 | return history 71 | 72 | history = generate_response(history) 73 | with gr.Blocks(theme=gr.themes.Glass(), css=custom_css) as demo: 74 | #TITLE SECTION 75 | with gr.Row(variant='compact'): 76 | with gr.Column(scale=1): 77 | gr.Image(value='img/qwen.png', 78 | show_label = False, 79 | show_download_button = False, container = False) 80 | with gr.Column(scale=4): 81 | gr.HTML("
" 82 | + "

<h1 style='text-align: center;'>Revised Benchmark with You as a Feedback!</h1>"
83 |                 + "<h2 style='text-align: center;'>💎 Qwen2.5-0.5B-it - 8K context window</h2>")
84 |         gr.Markdown("""*Run a prompt catalogue with 11 tasks*
85 | to validate the performance of a Small Language Model<br>
86 | At the end of every generation, the process waits for feedback from the user<br>
87 | ### Fixed tuning Parameters: 88 | ``` 89 | temperature = 0.25 90 | repeat_penalty = 1.178 91 | max_new_tokens = 900 92 | 93 | ``` 94 | """) 95 | with gr.Row(variant='compact'): # Progress status 96 | with gr.Column(scale=1): 97 | btn_test = gr.Button(value='Start AutoTest', variant='huggingface') 98 | act_task = gr.Text('', placeholder="running task..",show_label=False) 99 | with gr.Column(scale=4): 100 | actual_progress = gr.Slider(0, len(tasks), 101 | value=test_progress, label="Prompt Catalogue Progress", 102 | #info="Run the most used NLP tasks with a Language Model", 103 | interactive=False) 104 | with gr.Row(variant='compact'): # KpI 105 | # with gr.Column(): 106 | txt_ttft = gr.Text('', placeholder="seconds..", 107 | label='Time to first token') 108 | # with gr.Column(): 109 | txt_gentime = gr.Text('', placeholder="TimeDelta..", 110 | label='Generation Time') 111 | # with gr.Column(): 112 | txt_speed = gr.Text('', placeholder="t/s..", 113 | label='Generation Speed') 114 | # with gr.Column(): 115 | txt_TOTtkns = gr.Text('', placeholder="tokens..", 116 | label='Total num of Tokens') 117 | 118 | with gr.Row(variant='compact'): # ChatBot Area 119 | myBOT =gr.Chatbot(history,type='messages',avatar_images=("./img/user.jpg","./img/bot.jpg")) # 120 | with gr.Row(variant='compact'): #Temporary Area 121 | temp_input = gr.Text('what is Artificial Intelligence?', 122 | label='USER',lines=1) 123 | temp_ouput = gr.Text('', placeholder="Temporary Output", 124 | label='BOT',lines=3) 125 | with gr.Row(variant='compact'): # Feedback from the user 126 | with gr.Column(scale=1): 127 | gr.Markdown("""#### Respect this format: 128 | 129 | Put a number from 0 to 5, a space, and then your comments
130 | ``` 131 | 5 very good one 132 | ``` 133 | """) 134 | 135 | with gr.Column(scale=4): 136 | txt_fbck = gr.Text('', placeholder="Your evaluation feedback..", 137 | label='User Feedback',lines=2) 138 | btn_fbck = gr.Button(value='submit feedback', variant='huggingface') 139 | def update_history(history,a,b): 140 | history.append( 141 | ChatMessage(role="user", 142 | content=a) 143 | ) 144 | history.append( 145 | ChatMessage(role="assistant", 146 | content=b) 147 | ) 148 | return history 149 | 150 | def startInference(a): 151 | prompt = [ 152 | {"role": "user", "content": a} 153 | ] 154 | promptTKNS = countTokens(a) 155 | generation = '' 156 | fisrtround=0 157 | start = datetime.datetime.now() 158 | completion = client.chat.completions.create( 159 | messages=prompt, 160 | model='granite3-dense', 161 | temperature=0.25, 162 | frequency_penalty = 1.178, 163 | stop=stops, 164 | max_tokens=1500, 165 | stream=True 166 | ) 167 | for chunk in completion: 168 | try: 169 | if chunk.choices[0].delta.content: 170 | if fisrtround==0: 171 | generation += chunk.choices[0].delta.content 172 | ttftoken = datetime.datetime.now() - start 173 | secondsTTFT = ttftoken.total_seconds() 174 | ttFT = f"TimeToFristToken: {secondsTTFT:.2f} sec" 175 | fisrtround = 1 176 | else: 177 | generation += chunk.choices[0].delta.content 178 | except: 179 | pass 180 | answrTKN = countTokens(generation) 181 | totTKN = promptTKNS + answrTKN 182 | total_tokens = f"Total Tkns: {totTKN}" 183 | delta = datetime.datetime.now() - start 184 | seconds = delta.total_seconds() 185 | speed = totTKN/seconds 186 | speed_tokens = f"Gen Speed: {speed:.2f} t/s" 187 | yield generation, delta, speed_tokens, ttFT,total_tokens 188 | 189 | btn_test.click(startInference, inputs=[temp_input], 190 | outputs=[temp_ouput,txt_gentime,txt_speed,txt_ttft,txt_TOTtkns]).then( 191 | update_history,[myBOT,temp_input,temp_ouput],myBOT 192 | ) 193 | 194 | if __name__ == "__main__": 195 | demo.launch(inbrowser=True) -------------------------------------------------------------------------------- /layout3Ollama-Granite.py: -------------------------------------------------------------------------------- 1 | # Chat with an intelligent assistant in your terminal 2 | # MODEL: ollama-granite3dense 3 | # this wil run granite3-2B-instruct through ollamaAPI 4 | # sources: https://github.com/fabiomatricardi/-LLM-Studies/raw/main/00.consoleAPI_stream.py 5 | # https://github.com/fabiomatricardi/-LLM-Studies/blob/main/01.st-API-openAI_stream.py 6 | # OLLAMA MODEL CARD: https://ollama.com/library/granite3-dense/blobs/604785e698e9 7 | # OPenAI API for Ollama: https://github.com/ollama/ollama/blob/main/docs/openai.md 8 | # https://github.com/ibm-granite/granite-3.0-language-models 9 | # https://www.ibm.com/granite/docs/ 10 | # HUGGINFACE: https://huggingface.co/ibm-granite/granite-3.0-2b-instruct 11 | ##################################################################################################### 12 | 13 | """ 14 | > ollama show granite3-dense 15 | Model 16 | architecture granite 17 | parameters 2.6B 18 | context length 4096 19 | embedding length 2048 20 | quantization Q4_K_M 21 | 22 | License 23 | Apache License 24 | Version 2.0, January 2004 25 | """ 26 | import gradio as gr 27 | import datetime 28 | from promptLibv2Qwen import countTokens, writehistory, createCatalog 29 | from promptLibv2Qwen import genRANstring, createStats 30 | from gradio import ChatMessage 31 | from openai import OpenAI 32 | 33 | ## PREPARING FINAL DATASET 34 | 35 | pd_id = [] 36 | pd_task = [] 37 | 
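# NOTE: the pd_* lists below appear intended to accumulate one row per evaluated
# task (id, task name, user vote, free-text remarks). A minimal, hypothetical way
# to turn them into the final report (assuming pandas is available) would be:
#
#   import pandas as pd
#   report = pd.DataFrame({"id": pd_id, "task": pd_task,
#                          "vote": pd_vote, "remarks": pd_remarks})
#   report.to_csv("evaluation_report.csv", index=False)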
pd_vote = [] 38 | pd_remarks = [] 39 | test_progress = 0 40 | history = [] 41 | tasks = createCatalog() 42 | modelname = 'granite3-dense-2b' 43 | stops = ['<|end_of_text|>'] 44 | #load client with OpenAI API toward Ollama Endpoint 45 | client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama') 46 | print(f"2. Model {modelname} loaded with OLLAMA...") 47 | # fizing issue on dipsplaying avatars 48 | # https://www.gradio.app/guides/custom-CSS-and-JS 49 | # https://github.com/gradio-app/gradio/issues/9702 50 | custom_css = """ 51 | 52 | .message-row img { 53 | margin: 0px !important; 54 | } 55 | 56 | .avatar-container img { 57 | padding: 0px !important; 58 | } 59 | """ 60 | 61 | def generate_response(history): 62 | history.append( 63 | ChatMessage(role="user", 64 | content="Hi, my name is Fabio, a Medium writer. Who are you?") 65 | ) 66 | history.append( 67 | ChatMessage(role="assistant", 68 | content="Hi, I am your local GPT. How can I help you?") 69 | ) 70 | return history 71 | 72 | history = generate_response(history) 73 | with gr.Blocks(theme=gr.themes.Glass(), css=custom_css) as demo: 74 | #TITLE SECTION 75 | with gr.Row(variant='compact'): 76 | with gr.Column(scale=1): 77 | gr.Image(value='img/granite3.png', 78 | show_label = False, 79 | show_download_button = False, container = False) 80 | with gr.Column(scale=4): 81 | gr.HTML("
" 82 | + "

<h1 style='text-align: center;'>Revised Benchmark with You as a Feedback!</h1>"
83 |                 + "<h2 style='text-align: center;'>💎 granite3-dense-2B - 4K context window with Ollama engine</h2>")
84 |         gr.Markdown("""*Run a prompt catalogue with 11 tasks*
85 | to validate the performance of a Small Language Model<br>
86 | At the end of every generation, the process waits for feedback from the user<br>
87 | ### Fixed tuning Parameters: 88 | ``` 89 | temperature = 0.25 90 | repeat_penalty = 1.178 91 | max_new_tokens = 900 92 | 93 | ``` 94 | """) 95 | with gr.Row(variant='compact'): # Progress status 96 | with gr.Column(scale=1): 97 | btn_test = gr.Button(value='Start AutoTest', variant='huggingface') 98 | act_task = gr.Text('', placeholder="running task..",show_label=False) 99 | with gr.Column(scale=4): 100 | actual_progress = gr.Slider(0, len(tasks), 101 | value=test_progress, label="Prompt Catalogue Progress", 102 | #info="Run the most used NLP tasks with a Language Model", 103 | interactive=False) 104 | with gr.Row(variant='compact'): # KpI 105 | # with gr.Column(): 106 | txt_ttft = gr.Text('', placeholder="seconds..", 107 | label='Time to first token') 108 | # with gr.Column(): 109 | txt_gentime = gr.Text('', placeholder="TimeDelta..", 110 | label='Generation Time') 111 | # with gr.Column(): 112 | txt_speed = gr.Text('', placeholder="t/s..", 113 | label='Generation Speed') 114 | # with gr.Column(): 115 | txt_TOTtkns = gr.Text('', placeholder="tokens..", 116 | label='Total num of Tokens') 117 | 118 | with gr.Row(variant='compact'): # ChatBot Area 119 | myBOT =gr.Chatbot(history,type='messages',avatar_images=("./img/user.jpg","./img/bot.jpg")) # 120 | 121 | with gr.Row(variant='compact'): #Temporary Area 122 | temp_input = gr.Text('what is Artificial Intelligence?', 123 | label='ACTUAL TASK PROMPT',lines=2) 124 | 125 | with gr.Row(variant='compact'): # Feedback from the user 126 | with gr.Column(scale=1): 127 | gr.Markdown("""#### Respect this format: 128 | 129 | Put a number from 0 to 5, a space, and then your comments
130 | ``` 131 | 5 very good one 132 | ``` 133 | """) 134 | 135 | with gr.Column(scale=4): 136 | txt_fbck = gr.Text('', placeholder="Your evaluation feedback..", 137 | label='User Feedback',lines=2) 138 | btn_fbck = gr.Button(value='submit feedback', variant='huggingface') 139 | 140 | def update_history(history,a): 141 | history.append( 142 | ChatMessage(role="user", 143 | content=a) 144 | ) 145 | return history 146 | 147 | def startInference(a): 148 | print(a[-1]['content']) 149 | prompt = [ 150 | {"role": "user", "content": a[-1]['content']} 151 | ] 152 | promptTKNS = countTokens(a[-1]['content']) 153 | a.append({"role":"assistant","content":''}) 154 | generation = '' 155 | fisrtround=0 156 | start = datetime.datetime.now() 157 | completion = client.chat.completions.create( 158 | messages=prompt, 159 | model='granite3-dense', 160 | temperature=0.25, 161 | frequency_penalty = 1.178, 162 | stop=stops, 163 | max_tokens=900, 164 | stream=True 165 | ) 166 | for chunk in completion: 167 | try: 168 | if chunk.choices[0].delta.content: 169 | if fisrtround==0: 170 | a[-1]['content'] += chunk.choices[0].delta.content 171 | ttftoken = datetime.datetime.now() - start 172 | secondsTTFT = ttftoken.total_seconds() 173 | ttFT = f"TimeToFristToken: {secondsTTFT:.2f} sec" 174 | fisrtround = 1 175 | else: 176 | a[-1]['content'] += chunk.choices[0].delta.content 177 | except: 178 | pass 179 | answrTKN = countTokens(a[-1]['content']) 180 | totTKN = promptTKNS + answrTKN 181 | total_tokens = f"Total Tkns: {totTKN}" 182 | delta = datetime.datetime.now() - start 183 | seconds = delta.total_seconds() 184 | speed = totTKN/seconds 185 | speed_tokens = f"Gen Speed: {speed:.2f} t/s" 186 | yield a, delta, speed_tokens, ttFT,total_tokens 187 | 188 | btn_test.click(update_history, inputs=[myBOT,temp_input], 189 | outputs=[myBOT]).then(startInference,[myBOT],[myBOT,txt_gentime,txt_speed,txt_ttft,txt_TOTtkns]) 190 | 191 | def startloop(): 192 | #rasie flag to wait 193 | #start the loop 194 | #display task 195 | #append chatbot prompt 196 | #start the generation 197 | pass 198 | 199 | 200 | if __name__ == "__main__": 201 | demo.launch(inbrowser=True) -------------------------------------------------------------------------------- /promptLibv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | V2 changes 3 | added Time To First Token in the statistics ttft 4 | added some more prompts in the catalog 5 | - say 'I am ready' 6 | - modified for Llama3.2-1b Write in a list the three main key points - format output 7 | 8 | 20240929 FAMA 9 | """ 10 | 11 | import random 12 | import string 13 | import tiktoken 14 | 15 | def createCatalog(): 16 | """ 17 | Create a dictionary with 18 | 'task' : description of the NLP task in the prompt 19 | 'prompt' : the instruction prompt for the LLM 20 | """ 21 | context = """One of the things everybody in the West knows about China is that it is not a democracy, and is instead a regime run with an iron fist by a single entity, the Chinese Communist Party, whose leadership rarely acts transparently, running the country without the need for primary elections, alternative candidacies, etc. 22 | In general, those of us who live in democracies, with relatively transparent electoral processes, tend to consider the Chinese system undesirable, little more than a dictatorship where people have no say in who governs them. 
23 | That said, among the “advantages” of the Chinese system is that because the leadership never has to put its legitimacy to the vote, it can carry out very long-term planning in the knowledge that another administration isn’t going to come along and change those plans. 24 | Obviously, I put “advantages” in quotation marks because, as democrats, most of my readers would never be willing to sacrifice their freedom for greater planning, but there is no doubt that China, since its system works like this and its population seems to have accepted it for generations, intends to turn this into a comparative advantage, the term used in business when analyzing companies. 25 | It turns out that China’s capacity for long-term planning is achieving something unheard of in the West: it seems the country reached peak carbon dioxide and greenhouse gas emissions in 2023, and that the figures for 2024, driven above all by a determined increase in the installation of renewable energies, are not only lower, but apparently going to mark a turning point. 26 | China and India were until recently the planet’s biggest polluters, but they now offer a model for energy transition (there is still a long way to go; but we are talking about models, not a done deal). 27 | It could soon be the case that the so-called developing countries will be showing the West the way forward.""" 28 | catalog = [] 29 | prmpt_tasks = ["introduction", 30 | "explain in one sentence", 31 | "explain in three paragraphs", 32 | "say 'I am ready'", 33 | "summarize", 34 | "Summarize in two sentences", 35 | "Write in a list the three main key points - format output", 36 | "Table of Contents", 37 | "RAG", 38 | "Truthful RAG", 39 | "write content from a reference", 40 | "extract 5 topics", 41 | "Creativity: 1000 words SF story", 42 | "Reflection prompt" 43 | ] 44 | prmpt_coll = [ 45 | """Hi there I am Fabio, a Medium writer. who are you?""", 46 | """explain in one sentence what is science.\n""", 47 | """explain in three paragraphs what is artificial intelligence.\n""", 48 | f"""read the following text and when you are done say "I am ready". 49 | 50 | [text] 51 | {context} 52 | [end of text] 53 | 54 | """, 55 | f"""summarize the following text: 56 | 57 | [text] 58 | {context} 59 | [end of text] 60 | 61 | """, 62 | f"""Summarize in two sentences the following text 63 | 64 | [text] 65 | {context} 66 | [end of text] 67 | 68 | """, 69 | f"""1. extract the three key points from the provided text 70 | 2. format the output as a python list ["point 1","point 2", "point 3"] 71 | 72 | [text] 73 | {context} 74 | [end of text] 75 | 76 | python list: 77 | 78 | """, 79 | f"""A "table of content" is an ordered list of the topic contained in the text: write the "Table of Contents" of the following text. 80 | 81 | [text] 82 | {context} 83 | [end of text] 84 | 85 | """, 86 | f"""Reply to the question only using the provided context. If the answer is not contained in the text say "unanswerable". 87 | 88 | question: what China achieved with it's long-term planning? 89 | 90 | [context] 91 | {context} 92 | [end of context] 93 | 94 | answer: 95 | """, 96 | f"""Reply to the question only using the provided context. If the answer is not contained in the provided context say "unanswerable". 97 | 98 | question: who is Anne Frank? 
99 | 100 | [context] 101 | {context} 102 | [end of context] 103 | 104 | Remember: if you cannot answer based on the provided context, say "unanswerable" 105 | 106 | answer: 107 | """, 108 | 109 | f"""Using the following text as a reference, write a 5-paragraphs essay about "the benefits of China economic model". 110 | 111 | [text] 112 | {context} 113 | [end of text] 114 | 115 | """, 116 | f"""write the five most important topics from the following text: 117 | 118 | [text] 119 | {context} 120 | [end of text] 121 | 122 | """, 123 | """Science Fiction: The Last Transmission - Write a story that takes place entirely within a spaceship's cockpit as the sole surviving crew member attempts to send a final message back to Earth before the ship's power runs out. The story should explore themes of isolation, sacrifice, and the importance of human connection in the face of adversity. 800-1000 words. 124 | 125 | """, 126 | """You are an AI assistant designed to provide detailed, step-by-step responses. Your outputs should follow this structure: 127 | 1. Begin with a section. 128 | 2. Inside the thinking section: 129 | a. Briefly analyze the question and outline your approach. 130 | b. Present a clear plan of steps to solve the problem. 131 | c. Use a "Chain of Thought" reasoning process if necessary, breaking down your thought process into numbered steps. 132 | 3. Include a section for each idea where you: 133 | a. Review your reasoning. 134 | b. Check for potential errors or oversights. 135 | c. Confirm or adjust your conclusion if necessary. 136 | 4. Be sure to close all reflection sections. 137 | 5. Close the thinking section with . 138 | 6. Provide your final answer in an section. 139 | Always use these tags in your responses. Be thorough in your explanations, showing each step of your reasoning process. Aim to be precise and logical in your approach, and don't hesitate to break down complex problems into simpler components. Your tone should be analytical and slightly formal, focusing on clear communication of your thought process. 140 | Remember: Both and MUST be tags and must be closed at their conclusion 141 | Make sure all are on separate lines with no other text. Do not include other text on a line containing a tag. 142 | 143 | user question: explain why it is crucial for teachers to learn how to use generative AI for their job and for the future of education. Include relevant learning path for teachers and educators. 
144 | 145 | """ 146 | ] 147 | for i in range(0,len(prmpt_tasks)): 148 | catalog.append({'task':prmpt_tasks[i], 149 | 'prompt': prmpt_coll[i]}) 150 | return catalog 151 | 152 | def countTokens(text): 153 | """ 154 | Use tiktoken to count the number of tokens 155 | text -> str input 156 | Return -> int number of tokens counted 157 | """ 158 | encoding = tiktoken.get_encoding("r50k_base") #context_count = len(encoding.encode(yourtext)) 159 | numoftokens = len(encoding.encode(text)) 160 | return numoftokens 161 | 162 | def writehistory(filename,text): 163 | """ 164 | save a string into a logfile with python file operations 165 | filename -> str pathfile/filename 166 | text -> str, the text to be written in the file 167 | """ 168 | with open(f'{filename}', 'a', encoding='utf-8') as f: 169 | f.write(text) 170 | f.write('\n') 171 | f.close() 172 | 173 | def genRANstring(n): 174 | """ 175 | n = int number of char to randomize 176 | """ 177 | N = n 178 | res = ''.join(random.choices(string.ascii_uppercase + 179 | string.digits, k=N)) 180 | return res 181 | 182 | def createStats(delta,question,output,rating,logfilename,task,ttft): 183 | """ 184 | Takes in all the generation main info and return KPIs 185 | delta -> datetime.now() delta 186 | question -> str the user input to the LLM 187 | output -> str the generation from the LLM 188 | rating -> str human eval feedback rating 189 | logfilename -> str filepath/filename 190 | task -> str description of the NLP task describing the prompt 191 | ttft -> datetime.now() delta time to first token 192 | """ 193 | totalseconds = delta.total_seconds() 194 | prompttokens = countTokens(question) 195 | assistanttokens = countTokens(output) 196 | totaltokens = prompttokens + assistanttokens 197 | speed = totaltokens/totalseconds 198 | genspeed = assistanttokens/totalseconds 199 | ttofseconds = ttft.total_seconds() 200 | stats = f'''--- 201 | Prompt Tokens: {prompttokens} 202 | Output Tokens: {assistanttokens} 203 | TOTAL Tokens: {totaltokens} 204 | >>>⏱️ Time to First Token: {ttofseconds} seconds 205 | >>>⏱️ Inference time: {delta} 206 | >>>🧮 Inference speed: {speed:.3f} t/s 207 | >>>🏍️ Generation speed: {genspeed:.3f} t/s 208 | >>>📝 Logfile: {logfilename} 209 | >>>💚 User rating: {rating} 210 | >>>✅ NLP TAKS: {task} 211 | ''' 212 | return stats -------------------------------------------------------------------------------- /promptLibv2Qwen.py: -------------------------------------------------------------------------------- 1 | """ 2 | V2 changes 3 | added Time To First Token in the statistics ttft 4 | added some more prompts in the catalog 5 | - say 'I am ready' 6 | - modified for Llama3.2-1b Write in a list the three main key points - format output 7 | 8 | 20240929 FAMA 9 | """ 10 | 11 | import random 12 | import string 13 | import tiktoken 14 | 15 | def createCatalog(): 16 | """ 17 | Create a dictionary with 18 | 'task' : description of the NLP task in the prompt 19 | 'prompt' : the instruction prompt for the LLM 20 | """ 21 | context = """One of the things everybody in the West knows about China is that it is not a democracy, and is instead a regime run with an iron fist by a single entity, the Chinese Communist Party, whose leadership rarely acts transparently, running the country without the need for primary elections, alternative candidacies, etc. 
22 | In general, those of us who live in democracies, with relatively transparent electoral processes, tend to consider the Chinese system undesirable, little more than a dictatorship where people have no say in who governs them. 23 | That said, among the “advantages” of the Chinese system is that because the leadership never has to put its legitimacy to the vote, it can carry out very long-term planning in the knowledge that another administration isn’t going to come along and change those plans. 24 | Obviously, I put “advantages” in quotation marks because, as democrats, most of my readers would never be willing to sacrifice their freedom for greater planning, but there is no doubt that China, since its system works like this and its population seems to have accepted it for generations, intends to turn this into a comparative advantage, the term used in business when analyzing companies. 25 | It turns out that China’s capacity for long-term planning is achieving something unheard of in the West: it seems the country reached peak carbon dioxide and greenhouse gas emissions in 2023, and that the figures for 2024, driven above all by a determined increase in the installation of renewable energies, are not only lower, but apparently going to mark a turning point. 26 | China and India were until recently the planet’s biggest polluters, but they now offer a model for energy transition (there is still a long way to go; but we are talking about models, not a done deal). 27 | It could soon be the case that the so-called developing countries will be showing the West the way forward.""" 28 | catalog = [] 29 | prmpt_tasks = ["introduction", 30 | "explain in one sentence", 31 | "explain in three paragraphs", 32 | "say 'I am ready'", 33 | "summarize", 34 | "Summarize in two sentences", 35 | "Write in a list the three main key points - format output", 36 | "Table of Contents", 37 | "RAG", 38 | "Truthful RAG", 39 | "write content from a reference", 40 | "extract 5 topics", 41 | "Creativity: 1000 words SF story", 42 | "Reflection prompt" 43 | ] 44 | prmpt_coll = [ 45 | """Hi there I am Fabio, a Medium writer. who are you?""", 46 | """explain in one sentence what is science.\n""", 47 | """explain only in three paragraphs what is artificial intelligence.\n""", 48 | f"""read the following text and when you are done say "I am ready". 49 | 50 | [text] 51 | {context} 52 | [end of text] 53 | 54 | """, 55 | f"""summarize the following text: 56 | [text] 57 | {context} 58 | [end of text] 59 | write the summary using the information provided. 60 | 61 | Summary: 62 | """, 63 | f"""Write a two-sentences summary of the following text: 64 | [text] 65 | {context} 66 | [end of text] 67 | Remember: write only the summary with the text provided. 68 | """, 69 | f"""1. extract the three key points from the provided text 70 | 2. format the output as a python list ["point 1","point 2", "point 3"] 71 | [text] 72 | {context} 73 | [end of text] 74 | Return only the python list. 75 | """, 76 | f"""A "table of content" provides an organized structure to the text, allowing readers quick access to key points. Write the "table of content" of the following text. 77 | [text] 78 | {context} 79 | [end of text] 80 | 81 | "table of content": 82 | """, 83 | f"""Reply to the question only using the provided context. If the answer is not contained in the text say "unanswerable". 84 | 85 | question: what China achieved with it's long-term planning? 
86 | 87 | [context] 88 | {context} 89 | [end of context] 90 | 91 | answer: 92 | """, 93 | f"""Reply to the question only using the provided context. If the answer is not contained in the provided context say "unanswerable". 94 | 95 | question: who is Anne Frank? 96 | 97 | [context] 98 | {context} 99 | [end of context] 100 | 101 | Remember: if you cannot answer based on the provided context, say "unanswerable" 102 | 103 | answer: 104 | """, 105 | 106 | f"""Using the following text as a reference, write a 5-paragraphs essay about "the benefits of China economic model". 107 | 108 | [text] 109 | {context} 110 | [end of text] 111 | Remember: use the information provided and write exactly 5 paragraphs. 112 | """, 113 | f"""List five most important topics from the following text: 114 | [text] 115 | {context} 116 | [end of text] 117 | """, 118 | """Science Fiction: The Last Transmission - Write a story that takes place entirely within a spaceship's cockpit as the sole surviving crew member attempts to send a final message back to Earth before the ship's power runs out. The story should explore themes of isolation, sacrifice, and the importance of human connection in the face of adversity. 800-1000 words. 119 | 120 | """, 121 | """You are an AI assistant designed to provide detailed, step-by-step responses. Your outputs should follow this structure: 122 | 1. Begin with a section. 123 | 2. Inside the thinking section: 124 | a. Briefly analyze the question and outline your approach. 125 | b. Present a clear plan of steps to solve the problem. 126 | c. Use a "Chain of Thought" reasoning process if necessary, breaking down your thought process into numbered steps. 127 | 3. Include a section for each idea where you: 128 | a. Review your reasoning. 129 | b. Check for potential errors or oversights. 130 | c. Confirm or adjust your conclusion if necessary. 131 | 4. Be sure to close all reflection sections. 132 | 5. Close the thinking section with . 133 | 6. Provide your final answer in an section. 134 | Always use these tags in your responses. Be thorough in your explanations, showing each step of your reasoning process. Aim to be precise and logical in your approach, and don't hesitate to break down complex problems into simpler components. Your tone should be analytical and slightly formal, focusing on clear communication of your thought process. 135 | Remember: Both and MUST be tags and must be closed at their conclusion 136 | Make sure all are on separate lines with no other text. Do not include other text on a line containing a tag. 137 | 138 | user question: explain why it is crucial for teachers to learn how to use generative AI for their job and for the future of education. Include relevant learning path for teachers and educators. 
139 | 140 | """ 141 | ] 142 | for i in range(0,len(prmpt_tasks)): 143 | catalog.append({'task':prmpt_tasks[i], 144 | 'prompt': prmpt_coll[i]}) 145 | return catalog 146 | 147 | def countTokens(text): 148 | """ 149 | Use tiktoken to count the number of tokens 150 | text -> str input 151 | Return -> int number of tokens counted 152 | """ 153 | encoding = tiktoken.get_encoding("cl100k_base") #context_count = len(encoding.encode(yourtext)) 154 | numoftokens = len(encoding.encode(text)) 155 | return numoftokens 156 | 157 | def writehistory(filename,text): 158 | """ 159 | save a string into a logfile with python file operations 160 | filename -> str pathfile/filename 161 | text -> str, the text to be written in the file 162 | """ 163 | with open(f'{filename}', 'a', encoding='utf-8') as f: 164 | f.write(text) 165 | f.write('\n') 166 | f.close() 167 | 168 | def genRANstring(n): 169 | """ 170 | n = int number of char to randomize 171 | """ 172 | N = n 173 | res = ''.join(random.choices(string.ascii_uppercase + 174 | string.digits, k=N)) 175 | return res 176 | 177 | def createStats(delta,question,output,rating,logfilename,task,ttft): 178 | """ 179 | Takes in all the generation main info and return KPIs 180 | delta -> datetime.now() delta 181 | question -> str the user input to the LLM 182 | output -> str the generation from the LLM 183 | rating -> str human eval feedback rating 184 | logfilename -> str filepath/filename 185 | task -> str description of the NLP task describing the prompt 186 | ttft -> datetime.now() delta time to first token 187 | """ 188 | totalseconds = delta.total_seconds() 189 | prompttokens = countTokens(question) 190 | assistanttokens = countTokens(output) 191 | totaltokens = prompttokens + assistanttokens 192 | speed = totaltokens/totalseconds 193 | genspeed = assistanttokens/totalseconds 194 | ttofseconds = ttft.total_seconds() 195 | stats = f'''--- 196 | Prompt Tokens: {prompttokens} 197 | Output Tokens: {assistanttokens} 198 | TOTAL Tokens: {totaltokens} 199 | >>>⏱️ Time to First Token: {ttofseconds} seconds 200 | >>>⏱️ Inference time: {delta} 201 | >>>🧮 Inference speed: {speed:.3f} t/s 202 | >>>🏍️ Generation speed: {genspeed:.3f} t/s 203 | >>>📝 Logfile: {logfilename} 204 | >>>💚 User rating: {rating} 205 | >>>✅ NLP TAKS: {task} 206 | ''' 207 | return stats -------------------------------------------------------------------------------- /usefulResources.md: -------------------------------------------------------------------------------- 1 | # About Gradio 2 | Issues on avatars in the chatbot 3 | - [https://www.gradio.app/guides/custom-CSS-and-JS](https://www.gradio.app/guides/custom-CSS-and-JS) 4 | - [https://github.com/gradio-app/gradio/issues/9702](https://github.com/gradio-app/gradio/issues/9702) 5 | - [Gradio Theming](https://www.gradio.app/guides/theming-guide) 6 | ``` 7 | gr.themes.Base() - the "base" theme sets the primary color to blue but otherwise has minimal styling, making it particularly useful as a base for creating new, custom themes. 8 | gr.themes.Default() - the "default" Gradio 5 theme, with a vibrant orange primary color and gray secondary color. 9 | gr.themes.Origin() - the "origin" theme is most similar to Gradio 4 styling. Colors, especially in light mode, are more subdued than the Gradio 5 default theme. 10 | gr.themes.Citrus() - the "citrus" theme uses a yellow primary color, highlights form elements that are in focus, and includes fun 3D effects when buttons are clicked. 
11 | gr.themes.Monochrome() - the "monochrome" theme uses a black primary and white secondary color, and uses serif-style fonts, giving the appearance of a black-and-white newspaper.
12 | gr.themes.Soft() - the "soft" theme uses a purple primary color and white secondary color. It also increases the border radii around buttons and form elements and highlights labels.
13 | gr.themes.Glass() - the "glass" theme has a blue primary color and a translucent gray secondary color. The theme also uses vertical gradients to create a glassy effect.
14 | gr.themes.Ocean() - the "ocean" theme has a blue-green primary color and gray secondary color. The theme also uses horizontal gradients, especially for buttons and some form elements.
15 | ```
16 | - [Gradio Theme Gallery](https://huggingface.co/spaces/gradio/theme-gallery)
17 | - [GUIDE How to Create a Chatbot with Gradio](https://www.gradio.app/guides/creating-a-chatbot-fast)
18 | 
19 | ### Newspaper3k: Article scraping & curation
20 | - [https://github.com/codelucas/newspaper](https://github.com/codelucas/newspaper)
21 | - [https://github.com/fabiomatricardi/MetadataIsAllYouNeed/blob/main/KeyBERT_gr.py](https://github.com/fabiomatricardi/MetadataIsAllYouNeed/blob/main/KeyBERT_gr.py)
22 | 
23 | 
24 | ### General inference rules
25 | The Mistral models allow you to chat with a model that has been fine-tuned to follow instructions and respond to natural language prompts. A prompt is the input that you provide to the Mistral model. It can come in various forms, such as asking a question, giving an instruction, or providing a few examples of the task you want the model to perform. Based on the prompt, the Mistral model generates a text output as a response.
26 | The chat completion API accepts a list of chat messages as input and generates a response. This response is in the form of a new chat message with the role "assistant" as output. A minimal request/response sketch is shown a few lines below.
27 | - [https://docs.mistral.ai/capabilities/completion/](https://docs.mistral.ai/capabilities/completion/)
28 | 
29 | 
30 | # Models to try
31 | ### H2OVL-Mississippi-800M
32 | The H2OVL-Mississippi-800M is a compact yet powerful vision-language model from H2O.ai, featuring 0.8 billion parameters. Despite its small size, it delivers state-of-the-art performance in text recognition, excelling in the Text Recognition segment of OCRBench and outperforming much larger models in this domain. Built upon the robust architecture of the H2O-Danube language models, the Mississippi-800M extends their capabilities by seamlessly integrating vision and language tasks.
33 | 
34 | 
35 | - [HuggingFace repo at H2O](https://huggingface.co/h2oai/h2ovl-mississippi-800m)
36 | - [H2O article by Asghar Ghorbani](https://h2o.ai/blog/2024/document-classification-with-h2o-vl-mississippi--a-quick-guide/)
37 | ### MaziyarPanahi's Collections
38 | 🚀 GGUF Llama.cpp-compatible models that can be used on CPUs and GPUs!<br>
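Referring back to the *General inference rules* section above, here is a minimal sketch of the chat-completion request/response cycle. It reuses the OpenAI-compatible client pattern already used by the scripts in this repo; the endpoint and model name are placeholders for whatever local server you run:

```python
from openai import OpenAI

# Placeholder endpoint and model name: any OpenAI-compatible server works the same way
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
response = client.chat.completions.create(
    model='granite3-dense',
    messages=[{"role": "user", "content": "Explain retrieval-augmented generation in one sentence."}],
)
# The reply comes back as a new chat message with role "assistant"
print(response.choices[0].message.content)
```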
39 | Here the [AMAZING COLLECTION](https://huggingface.co/collections/MaziyarPanahi/gguf-65afc99c3997c4b6d2d9e1d5) 40 | ### h2oai/deberta_finetuned_pii 41 | A finetuned model designed to recognize and classify Personally Identifiable Information (PII) within unstructured text data. This powerful model accurately identifies a wide range of PII categories, such as account names, credit card numbers, emails, phone numbers, and addresses. The model is specifically trained to detect various PII types, including but not limited to: 42 | 43 | ``` 44 | | Category | Data | 45 | |------------------------|----------------------------------------------------------------------------------------| 46 | | Account-related information | Account name, account number, and transaction amounts | 47 | | Banking details | BIC, IBAN, and Bitcoin or Ethereum addresses | 48 | | Personal information | Full name, first name, middle name, last name, gender, and date of birth | 49 | | Contact information | Email, phone number, and street address (including building number, city, county, state, and zip code) | 50 | | Job-related data | Job title, job area, job descriptor, and job type | 51 | | Financial data | Credit card number, issuer, CVV, and currency information (code, name, and symbol) | 52 | | Digital identifiers | IP addresses (IPv4 and IPv6), MAC addresses, and user agents | 53 | | Online presence | URL, usernames, and passwords | 54 | | Other sensitive data | SSN, vehicle VIN and VRM, phone IMEI, and nearby GPS coordinates | 55 | 56 | ``` 57 | 58 | The PII Identifier Model ensures data privacy and compliance by effectively detecting and categorizing sensitive information within documents, emails, user-generated content, and more. Make your data processing safer and more secure with our state-of-the-art PII detection technology. 59 | - [Hugging Face Repo](https://huggingface.co/h2oai/deberta_finetuned_pii) 60 | - 61 | 62 | 63 | 64 | # Large Language Models course free 65 | - [llm-engineering-handbook](https://github.com/aofoegbu/llm-engineers-handbook) 66 | - [DeepLearning.AI course on Agents](https://learn.deeplearning.ai/courses/practical-multi-ai-agents-and-advanced-use-cases-with-crewai/lesson/1/introduction) 67 | - [Microsoft BitNet.cpp](https://github.com/microsoft/BitNet) 68 | - [ArXiv paper *1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs*](https://arxiv.org/abs/2410.16144v1) 69 | - [MotleyCrew AI](https://motleycrew.ai/) 70 | - [MotleyCrewAI-readthedocs](https://motleycrew.readthedocs.io/en/latest/quickstart.html) 71 | - [OpenVino quick guide CheatSheet](https://docs.openvino.ai/2024/_static/download/OpenVINO_Quick_Start_Guide.pdf) 72 | - [OpenVino Toolkit Getting Started](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/get-started.html) 73 | 74 | # About IBM/Granite models 75 | - [Ollama serving](https://ollama.com/library/granite3-moe:1b) 76 | - [Granite3 IbM on GitHub](https://github.com/ibm-granite/granite-3.0-language-models) 77 | - [Fabio you are the Benchmark](https://github.com/fabiomatricardi/YouAreTheBenchmark) 78 | - 79 | 80 | 81 | ## Markdown resoources 82 | - [MArkdown Videos](https://github.com/Snailedlt/Markdown-Videos) 83 | - 84 | 85 | 86 | 87 | 88 | ## [Universal Assisted Generation: Faster Decoding with Any Assistant Model](https://huggingface.co/blog/universal_assisted_generation) 89 | TL;DR: Many LLMs such as gemma-2-9b and Mixtral-8x22B-Instruct-v0.1 lack a much smaller version to use for assisted generation. 
In this blog post, we present Universal Assisted Generation: a method developed by Intel Labs and Hugging Face which extends assisted generation to work with a small language model from any model family 🤯. As a result, it is now possible to accelerate inference from any decoder or Mixture of Experts model by 1.5x-2.0x at almost zero cost 🔥🔥🔥!
90 | In order to mitigate this pain point, Intel Labs, together with our friends at Hugging Face, has developed Universal Assisted Generation (UAG). UAG enables selecting any pair of target and assistant models regardless of their tokenizers. For example, gemma-2-9b can be used as the target model, with the tiny vicuna-68m as the assistant.
91 | The main idea behind the method is 2-way tokenizer translation. Once the assistant model completes a generation iteration, the assistant tokens are converted to text, which is then tokenized with the target model's tokenizer to produce target tokens. After the verification step, the target tokens are converted back to the assistant token format and appended to the assistant model's context before the next iteration begins.
92 | Since the assistant and target tokenizers use different vocabularies, it is necessary to handle the discrepancies between them. To accurately re-encode the newly generated assistant tokens, it is essential to prepend a context window consisting of several previous tokens. The entire sequence is then re-encoded into the target token format and aligned with the most recent target tokens to pinpoint the exact location where the newly generated tokens should be appended. This process is illustrated in the video below.
93 | 
94 | 
95 | https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/universal-assisted-generation/method-animation.mov
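A minimal sketch of UAG in code, mirroring the model pair mentioned above. The `assistant_model`, `tokenizer`, and `assistant_tokenizer` arguments to `generate()` assume a recent `transformers` release with universal assisted generation support, and the assistant checkpoint id is illustrative:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

TARGET = "google/gemma-2-9b"       # target model from the example above
ASSISTANT = "double7/vicuna-68m"   # tiny assistant (illustrative checkpoint id)

tokenizer = AutoTokenizer.from_pretrained(TARGET)
assistant_tokenizer = AutoTokenizer.from_pretrained(ASSISTANT)
model = AutoModelForCausalLM.from_pretrained(TARGET)
assistant_model = AutoModelForCausalLM.from_pretrained(ASSISTANT)

inputs = tokenizer("Alice and Bob are", return_tensors="pt")
outputs = model.generate(
    **inputs,
    assistant_model=assistant_model,          # drafts candidate tokens cheaply
    tokenizer=tokenizer,                      # both tokenizers are needed for the
    assistant_tokenizer=assistant_tokenizer,  # 2-way token translation step
    max_new_tokens=64,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```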