├── NtworkGPTbanner.jpg
├── README.md
├── fastOpenAI-API1.gif
├── githubFASTapi.png
├── main.py
├── main2.py
├── networkGPT3.png
├── stapp2.py
└── textchat.py

--------------------------------------------------------------------------------
/NtworkGPTbanner.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/ModernFastAPI/6584d3d75acdc70636520818a5a2f8a5b3ab6967/NtworkGPTbanner.jpg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# ModernFastAPI
Repo of the code from the Medium article *Build a powerful LLM API right on your computer*

This project has 3 parts:
1. Create your first FastAPI app and interact with it
2. Create a Streamlit AI app that uses TinyLlama-1.1B-OpenOrca as an instruction AI you can reach on your local network
3. **Use the llama-cpp-python built-in API and Streamlit to give your team a nice chatbot** (coming soon)


### Here are the articles on Medium

- [Create your LLM API: your ChatBOT as a service — part 1](https://medium.com/generative-ai/create-your-llm-api-your-chatbot-as-a-service-part-1-4d4213182a1a)
- [Create your LLM API: your ChatBOT as a service — part 2](https://generativeai.pub/create-your-llm-api-your-chatbot-as-a-service-part-2-b21eb6efea72)
- [Create your LLM API: ChatBOT as a service — part 3](https://generativeai.pub/create-your-llm-api-chatbot-as-a-service-part-3-ca336d56f0d3)

### Part 3 file
This is the Python file for the textual interface described in the part 3 article.

The result will be as shown below (terminal server llama-cpp-python on the left, chat interface on the right):

--------------------------------------------------------------------------------
/fastOpenAI-API1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/ModernFastAPI/6584d3d75acdc70636520818a5a2f8a5b3ab6967/fastOpenAI-API1.gif

--------------------------------------------------------------------------------
/githubFASTapi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/ModernFastAPI/6584d3d75acdc70636520818a5a2f8a5b3ab6967/githubFASTapi.png

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# API import Section
from fastapi import FastAPI, Request
import asyncio
# LLM section import
from llama_cpp import Llama
# Needed to return a copy of the generation result
import copy

app = FastAPI(
    title="Inference API for TinyLlamaOO",
    description="A simple API that uses TinyLlama OpenOrca as a chatbot",
    version="1.0",
)


### INITIALIZING TINYLLAMA-OpenOrca MODEL
modpath = "model/tinyllama-1.1b-1t-openorca.Q4_K_M.gguf"
llm = Llama(
    model_path=modpath, n_gpu_layers=0,
    n_ctx=2048, verbose=False,
    stop=["<|im_end|>", ''],
    chat_format="chatml",
)


@app.get('/')
async def hello():
    return {"hello": "Artificial Intelligence enthusiast"}


@app.get('/model')
async def model():
    text = "Who is Tony Stark?"
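    # The prompt below follows the ChatML format that TinyLlama-1.1B-OpenOrca expects:
    # each turn is wrapped in <|im_start|>role ... <|im_end|> markers, and the model
    # completes the open assistant turn (generation stops at <|im_end|>).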
    template = f"""<|im_start|>system\nYou are a helpful ChatBot assistant.<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"""
    res = llm(template)
    result = copy.deepcopy(res)
    return {"result": result['choices'][0]['text']}


@app.get('/tinyllama')
async def tinyllama(text: str):
    template = f"""<|im_start|>system
You are a helpful ChatBot assistant.<|im_end|>
<|im_start|>user
{text}<|im_end|>
<|im_start|>assistant"""
    res = llm(template, temperature=0.42, repeat_penalty=1.5, max_tokens=300)
    result = copy.deepcopy(res)
    return {"result": result['choices'][0]['text']}

--------------------------------------------------------------------------------
/main2.py:
--------------------------------------------------------------------------------
# API import Section
from fastapi import FastAPI, Request
import asyncio
# LLM section import
from llama_cpp import Llama
# Needed to return a copy of the generation result
import copy

app = FastAPI(
    title="Inference API for TinyLlamaOO Instruct",
    description="A simple API that uses TinyLlama OpenOrca for Instruction-RAG",
    version="2.0",
)

### INITIALIZING TINYLLAMA-OpenOrca MODEL
modpath = "model/tinyllama-1.1b-1t-openorca.Q4_K_M.gguf"
llm = Llama(
    model_path=modpath, n_gpu_layers=0,
    n_ctx=2048, verbose=False,
    stop=["<|im_end|>", ''],
    chat_format="chatml",
)


@app.get('/')
async def hello():
    return {"hello": "Artificial Intelligence enthusiast"}


@app.get('/model')
async def model():
    text = "Who is Tony Stark?"
    template = f"""<|im_start|>system\nYou are a helpful ChatBot assistant.<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"""
    res = llm(template)
    result = copy.deepcopy(res)
    return {"result": result['choices'][0]['text']}


@app.get('/tinyllama')
async def tinyllama(text: str):
    template = f"""<|im_start|>system
You are a helpful ChatBot assistant.<|im_end|>
<|im_start|>user
{text}<|im_end|>
<|im_start|>assistant"""
    res = llm(template, temperature=0.42, repeat_penalty=1.5, max_tokens=300)
    result = copy.deepcopy(res)
    return {"result": result['choices'][0]['text']}


from pydantic import BaseModel
from typing import List


class Instruction(BaseModel):
    temperature: float | None = 0.1
    maxlen: int | None = 150
    sysmessage: str
    promptmessage: str


@app.post('/instruct/')
async def instruct(instruction: Instruction):
    chattemperature = instruction.temperature
    chatlen = instruction.maxlen
    template = f"""<|im_start|>system
{instruction.sysmessage}<|im_end|>
<|im_start|>user
{instruction.promptmessage}<|im_end|>
<|im_start|>assistant"""
    stops = ["<|im_end|>", '']
    chat = llm(template, temperature=chattemperature,
               stop=stops, repeat_penalty=1.7, max_tokens=chatlen)
    ongoingchat = copy.deepcopy(chat)
    response = {"result": ongoingchat['choices'][0]['text']}
    print(ongoingchat['choices'][0]['text'])
    return response

--------------------------------------------------------------------------------
/networkGPT3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fabiomatricardi/ModernFastAPI/6584d3d75acdc70636520818a5a2f8a5b3ab6967/networkGPT3.png

--------------------------------------------------------------------------------
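A minimal sketch of how the /instruct/ POST endpoint in main2.py can be exercised from another script, assuming the API was started with `uvicorn main2:app --host 0.0.0.0 --port 8000` and the GGUF file sits in the model/ folder:

import requests

payload = {
    "temperature": 0.1,
    "maxlen": 150,
    "sysmessage": "You are a helpful ChatBot assistant.",
    "promptmessage": "Explain what FastAPI is in one sentence.",
}
# The endpoint answers with {"result": "<generated text>"}
r = requests.post("http://127.0.0.1:8000/instruct/", json=payload)
print(r.json()["result"])

--------------------------------------------------------------------------------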
/stapp2.py:
--------------------------------------------------------------------------------
import streamlit as st
import requests
import ast
from time import sleep
import datetime


def writehistory(filename, text):
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')


# Function to POST to the FastAPI endpoint
def get_reply(temperature, maxlen, sysmessage, promptmessage):
    API_URL = "http://127.0.0.1:8000/instruct/"
    headers = {}
    payloads = {
        "temperature": temperature,
        "maxlen": maxlen,
        "sysmessage": sysmessage,
        "promptmessage": promptmessage
    }
    response = requests.post(API_URL, headers=headers, json=payloads)
    risposta = response.content.decode("utf-8")
    res = ast.literal_eval(risposta)
    return res


# Set the webpage title
st.set_page_config(
    page_title="Your own 🕸️ NetworkGPT",
    page_icon="🐋")

# Create a header element
st.header("Your own NetworkGPT with 🦙TinyLlama OpenOrca🐋")
st.markdown("#### :green[*tinyllama-1.1b-1t-openorca.Q4_K_M.gguf - the best tiny model?*]")

if "logfilename" not in st.session_state:
    ## Logger file
    tstamp = datetime.datetime.now()
    tstamp = str(tstamp).replace(' ', '_')
    tstamp = str(tstamp).replace(':', '_')
    logfile = f'{tstamp[:-7]}_log.txt'
    st.session_state.logfilename = logfile
    # Write the first 2 messages into the history log
    writehistory(st.session_state.logfilename, f'🧠🫡: You are a helpful assistant.')
    writehistory(st.session_state.logfilename, f'🐋: How may I help you today?\n-------------------------\n')

if "sysmessage" not in st.session_state:
    st.session_state.sysmessage = ""

if "promptmessage" not in st.session_state:
    st.session_state.promptmessage = ""

if "maxlen" not in st.session_state:
    st.session_state.maxlen = 200

if "temperature" not in st.session_state:
    st.session_state.temperature = 0.1

with st.sidebar:
    st.markdown("""### Parameters:""", unsafe_allow_html=True)
    st.session_state.temperature = st.slider('Temperature:', min_value=0.00, max_value=1.0, value=0.1, step=0.02)
    st.session_state.maxlen = st.slider('MaxLength:', min_value=50, max_value=500, value=200, step=5)
    st.markdown("---")
    st.markdown("### Logfile")
    st.markdown(st.session_state.logfilename)


st.session_state.sysmessage = st.text_area('System Message', value="", height=20)
st.session_state.promptmessage = st.text_area('User Message', value="", height=170)
btn = st.button('Ask TinyLlama', type='primary')
resultarea = st.empty()
resultarea.write("Reply will go here...")
st.write('---')

if btn:
    log = f'SYS: {st.session_state.sysmessage}\nUSER: {st.session_state.promptmessage}'
    writehistory(st.session_state.logfilename, log)
    response = get_reply(st.session_state.temperature, st.session_state.maxlen,
                         st.session_state.sysmessage, st.session_state.promptmessage)
    resultarea.markdown(response['result'])
    log = f"TINYLLAMA: {response['result']}\n---\n\n"
    writehistory(st.session_state.logfilename, log)

--------------------------------------------------------------------------------
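stapp2.py is the Streamlit front end for the /instruct/ API, so the FastAPI server from main2.py must already be listening on 127.0.0.1:8000 on the same machine. A typical two-terminal launch, assuming uvicorn and streamlit are installed, would be:

uvicorn main2:app --host 0.0.0.0 --port 8000
streamlit run stapp2.py

Streamlit then prints a Network URL (port 8501 by default) that other machines on the local network can open.

--------------------------------------------------------------------------------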
/textchat.py:
--------------------------------------------------------------------------------
# Chat with an intelligent assistant in your terminal
from openai import OpenAI

# Point to the local server.
# Replace localhost with the IP address of the computer acting as the server;
# it may be something like "http://192.168.1.52:8000/v1"
client = OpenAI(base_url="http://localhost:8000/v1",
                api_key="not-needed")
history = [
    {"role": "system", "content": "You are an intelligent assistant. You always provide well-reasoned answers that are both correct and helpful."},
    {"role": "user", "content": "Hello, introduce yourself to someone opening this program for the first time. Be concise."},
]
print("\033[92;1m")
while True:
    conv_messages = []
    len_context = len(history)
    # Keep the context short: past 13 messages, send only the system message
    # plus the 9 most recent messages to the model
    if len_context > 13:
        print("\033[93;1m")
        print('Limiter passed')
        print("\033[92;1m")
        x = 13 - 4
        conv_messages.append(history[0])
        for i in range(0, x):
            conv_messages.append(history[-x + i])
    else:
        conv_messages = history
    completion = client.chat.completions.create(
        model="local-model",  # this field is currently unused
        messages=conv_messages,
        temperature=0.7,
        stream=True,
    )

    new_message = {"role": "assistant", "content": ""}
    # the first generation is based on the initial messages
    for chunk in completion:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
            new_message["content"] += chunk.choices[0].delta.content

    history.append(new_message)
    # here we ask for the user input and check whether to exit the program
    print("\033[91;1m")
    userinput = input("> ")
    if userinput.lower() in ["quit", "exit"]:
        print("\033[0mBYE BYE!")
        break
    history.append({"role": "user", "content": userinput})
    print("\033[92;1m")

--------------------------------------------------------------------------------
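textchat.py talks to the OpenAI-compatible endpoint exposed by the llama-cpp-python built-in server. A minimal sketch of how that server can be started, assuming llama-cpp-python was installed with the server extra (pip install llama-cpp-python[server]) and the GGUF file is in the model/ folder:

python -m llama_cpp.server --model model/tinyllama-1.1b-1t-openorca.Q4_K_M.gguf --host 0.0.0.0 --port 8000 --chat_format chatml

With that server running, python textchat.py starts the terminal chat described in the part 3 article.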