├── 1. openaitest.py ├── 2. main.py ├── 3. instructor.py ├── 4. multimodal.py ├── README.md ├── app.py ├── config.json └── stock_data.py /1. openaitest.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from colorama import init 3 | from colorama import Fore, Back, Style 4 | import time 5 | 6 | 7 | init() 8 | 9 | client = OpenAI( 10 | base_url="http://localhost:8000/v1", 11 | api_key="123", 12 | ) 13 | 14 | time.sleep(5) 15 | 16 | prompts = [ 17 | "what is ROI in the context of finance, provide a worked example?", 18 | "define the efficient frontier in the context of finance", 19 | "what is glass stegal?", 20 | "how does derivative pricing work?", 21 | ] 22 | 23 | 24 | for prompt in prompts: 25 | print(Fore.LIGHTMAGENTA_EX + prompt, end="\n") 26 | response = client.chat.completions.create( 27 | model="llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf", 28 | messages=[ 29 | { 30 | "role": "user", 31 | "content": prompt, 32 | } 33 | ], 34 | stream=True, 35 | max_tokens=20, 36 | ) 37 | for chunk in response: 38 | if chunk.choices[0].delta.content is not None: 39 | print( 40 | Fore.LIGHTBLUE_EX + chunk.choices[0].delta.content, 41 | end="", 42 | flush=True, 43 | ) 44 | print("\n") 45 | -------------------------------------------------------------------------------- /2. main.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import streamlit as st 3 | 4 | client = OpenAI( 5 | base_url="http://localhost:8000/v1", 6 | api_key="123", 7 | ) 8 | 9 | if "messages" not in st.session_state: 10 | st.session_state["messages"] = [ 11 | { 12 | "role": "system", 13 | "content": """You are a helpful assistant. If you do not know the answer, reply I don't know 14 | don't make things up.""", 15 | } 16 | ] 17 | 18 | st.title("🚀 LLaMa CPP Python") 19 | for message in st.session_state.messages: 20 | st.chat_message(message["role"]).markdown(message["content"]) 21 | 22 | prompt = st.chat_input("Pass your input here") 23 | if prompt: 24 | st.chat_message("user").markdown(prompt) 25 | st.session_state.messages.append({"role": "user", "content": prompt}) 26 | 27 | response = client.chat.completions.create( 28 | model="llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf", 29 | messages=st.session_state.messages, 30 | stream=True, 31 | ) 32 | 33 | complete_response = "" 34 | with st.chat_message("assistant"): 35 | message_placeholder = st.empty() 36 | for chunk in response: 37 | if chunk.choices[0].delta.content is not None: 38 | complete_response += chunk.choices[0].delta.content 39 | message_placeholder.markdown(complete_response + "▌") 40 | message_placeholder.markdown(complete_response) 41 | st.session_state.messages.append( 42 | {"role": "assistant", "content": complete_response} 43 | ) 44 | -------------------------------------------------------------------------------- /3. 
instructor.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import streamlit as st 3 | 4 | import instructor 5 | from pydantic import BaseModel 6 | 7 | client = OpenAI( 8 | base_url="http://localhost:8000/v1", 9 | api_key="123", 10 | ) 11 | 12 | # Enables `response_model` 13 | client = instructor.patch(client=client) 14 | 15 | 16 | class UserDetail(BaseModel): 17 | stock_ticker: str 18 | start_date: int 19 | end_date: str 20 | 21 | 22 | if "messages" not in st.session_state: 23 | st.session_state["messages"] = [ 24 | { 25 | "role": "system", 26 | "content": """You are a helpful assistant. If you do not know the answer, reply I don't know 27 | don't make things up.""", 28 | } 29 | ] 30 | 31 | st.title("🚀 LLaMa CPP Python") 32 | for message in st.session_state.messages: 33 | st.chat_message(message["role"]).markdown(message["content"]) 34 | 35 | prompt = st.chat_input("Pass your input here") 36 | if prompt: 37 | st.chat_message("user").markdown(prompt) 38 | st.session_state.messages.append({"role": "user", "content": prompt}) 39 | 40 | response = client.chat.completions.create( 41 | max_tokens=-1, 42 | model="mistral-function-calling", 43 | response_model=UserDetail, 44 | messages=[ 45 | { 46 | "role": "user", 47 | "content": prompt, 48 | }, 49 | ], 50 | ) 51 | 52 | complete_response = "" 53 | with st.chat_message("assistant"): 54 | message_placeholder = st.empty() 55 | for chunk in response: 56 | st.write(chunk) 57 | 58 | st.session_state.messages.append( 59 | {"role": "assistant", "content": complete_response} 60 | ) 61 | -------------------------------------------------------------------------------- /4. multimodal.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import streamlit as st 3 | 4 | import instructor 5 | from pydantic import BaseModel 6 | 7 | client = OpenAI( 8 | base_url="http://localhost:8000/v1", 9 | api_key="123", 10 | ) 11 | 12 | # # Enables `response_model` 13 | # client = instructor.patch(client=client) 14 | 15 | 16 | # class UserDetail(BaseModel): 17 | # name: str 18 | # age: int 19 | # job: str 20 | 21 | 22 | if "messages" not in st.session_state: 23 | st.session_state["messages"] = [ 24 | { 25 | "role": "system", 26 | "content": """You are a helpful assistant. 
If you do not know the answer, reply I don't know 27 | don't make things up.""", 28 | } 29 | ] 30 | 31 | st.title("🚀 LLaMa CPP Python") 32 | for message in st.session_state.messages: 33 | st.chat_message(message["role"]).markdown(message["content"]) 34 | 35 | prompt = st.chat_input("Pass your input here") 36 | if prompt: 37 | st.chat_message("user").markdown(prompt) 38 | st.session_state.messages.append({"role": "user", "content": prompt}) 39 | 40 | response = client.chat.completions.create( 41 | max_tokens=-1, 42 | model="gpt-4-vision-preview", 43 | messages=[ 44 | { 45 | "role": "user", 46 | "content": [ 47 | { 48 | "type": "image_url", 49 | "image_url": { 50 | "url": "https://www.rhodeahead.com/sites/rhodeahead.com/files/field/image/ra_comm_prescription_label-500.jpg" 51 | }, 52 | }, 53 | {"type": "text", "text": prompt}, 54 | ], 55 | } 56 | ], 57 | ) 58 | 59 | complete_response = "" 60 | with st.chat_message("assistant"): 61 | message_placeholder = st.empty() 62 | for chunk in response: 63 | st.write(chunk) 64 | 65 | st.session_state.messages.append( 66 | {"role": "assistant", "content": complete_response} 67 | ) 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLaMA 2 | Run sick LLM apps hyper fast on your local machine for funzies. 3 | 4 | ## See it live and in action 📺 5 | 6 | 7 | 8 | # Startup 🚀 9 | 1. Git clone https://github.com/ggerganov/llama.cpp 10 | 2. Run the make commands: 11 | - Mac: `cd llama.cpp && make` 12 | - Windows (from here ): 13 | 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 14 | 2. Extract `w64devkit` on your pc. 15 | 3. Run `w64devkit.exe`. 16 | 4. Use the `cd` command to reach the `llama.cpp` folder. 17 | 5. From here you can run: 18 | ```bash 19 | make 20 | ``` 21 | 3. pip install openai 'llama-cpp-python[server]' pydantic instructor streamlit 22 | 4. Start the server: 23 | - Single Model Chat
24 |     `python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf` 25 |    - Single Model Chat with GPU Offload
26 |     `python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf --n_gpu_layers -1` 27 |    - Single Model Function Calling with GPU Offload
28 |     `python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf --n_gpu_layers -1 --chat_format functionary` 29 |    - Multiple Model Load with Config
30 |     `python -m llama_cpp.server --config_file config.json` 31 |    - Multi Modal Models
32 |     `python -m llama_cpp.server --model models/llava-v1.5-7b-Q4_K.gguf --clip_model_path models/llava-v1.5-7b-mmproj-Q4_0.gguf --n_gpu_layers -1 --chat_format llava-1-5` (a quick connection check is sketched below)
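Once a server is running, you can confirm it responds by pointing the standard OpenAI client at it. This is a minimal sketch: it assumes the default `http://localhost:8000/v1` endpoint and the placeholder API key used by the scripts in this repo, plus the `mistral` alias defined in `config.json` (for single-model runs, pass whatever model path you started the server with instead).

```python
from openai import OpenAI

# Local llama-cpp-python server; the key is a dummy value since the server does not check it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="123")

# List the model(s)/aliases the server has loaded.
for model in client.models.list().data:
    print(model.id)

# Run a one-off completion against one of them.
response = client.chat.completions.create(
    model="mistral",  # alias from config.json; swap in your model path for single-model runs
    messages=[{"role": "user", "content": "Say hello in five words."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```

If this prints a model id and a short reply, the scripts in this repo (`1. openaitest.py`, `2. main.py`, etc.) should work against the same endpoint.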
33 | 34 | # Models Used 🤖 35 | - Mistral: https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF 36 | - Mixtral: https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF 37 | - LLaVa: https://huggingface.co/jartine/llava-v1.5-7B-GGUF/tree/main 38 | 39 | # Who, When, Why? 40 | 41 | 👨🏾‍💻 Author: Nick Renotte
42 | 📅 Version: 1.x
43 | 📜 License: This project is licensed under the MIT License
44 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Streamlit the app framework 4 | import streamlit as st 5 | 6 | # Bring the instructor library 7 | import instructor 8 | 9 | # Bring in the Base Model class 10 | from pydantic import BaseModel 11 | 12 | # Bring in the stock prices function 13 | from stock_data import get_stock_prices 14 | 15 | # Create a client 16 | client = OpenAI(api_key="jhjhjh1234", base_url="http://localhost:8000/v1") 17 | # Create a patched client 18 | client = instructor.patch(client=client) 19 | 20 | 21 | # Structure what want extracted 22 | class ResponseModel(BaseModel): 23 | ticker: str 24 | days: int 25 | 26 | 27 | # The title of the app 28 | st.title("🚀 Fake OpenAI Server App (...llama cpp)") 29 | prompt = st.chat_input("Pass your prompt here") 30 | 31 | # If the user types a prompt and hits enter 32 | if prompt: 33 | st.chat_message("user").markdown(prompt) 34 | 35 | # Function calling LLM call 36 | response = client.chat.completions.create( 37 | # which model we want to use 38 | model="mistral-function-calling", 39 | # pass through our prompt 40 | messages=[{"role": "user", "content": prompt}], 41 | # Add stream 42 | # stream=True, 43 | response_model=ResponseModel, 44 | ) 45 | 46 | st.chat_message("ai").markdown(response) 47 | 48 | try: 49 | prices = get_stock_prices(response.ticker, response.days) 50 | st.chat_message("ai").markdown(prices) 51 | 52 | # Summary output prompt + prices 53 | fullresponse = client.chat.completions.create( 54 | # which model we want to use 55 | model="mixtral", 56 | # pass through our prompt 57 | messages=[{"role": "user", "content": prompt + "\n" + str(prices)}], 58 | # Add stream 59 | stream=True, 60 | ) 61 | 62 | with st.chat_message("ai"): 63 | completed_message = "" 64 | message = st.empty() 65 | # Streaming the response out 66 | for chunk in fullresponse: 67 | # If the value is not none print it out 68 | if chunk.choices[0].delta.content is not None: 69 | completed_message += chunk.choices[0].delta.content 70 | message.markdown(completed_message) 71 | # print(chunk.choices[0].delta.content, flush=True, end="") 72 | 73 | except Exception as e: 74 | st.chat_message("ai").markdown("Something went wrong 😭") 75 | 76 | # with st.chat_message("ai"): 77 | # completed_message = "" 78 | # message = st.empty() 79 | # # Streaming the response out 80 | # for chunk in response: 81 | # # If the value is not none print it out 82 | # if chunk.choices[0].delta.content is not None: 83 | # completed_message += chunk.choices[0].delta.content 84 | # message.markdown(completed_message) 85 | # # print(chunk.choices[0].delta.content, flush=True, end="") 86 | 87 | # Print it out 88 | # print(response.choices[0].message.content) 89 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "host": "0.0.0.0", 3 | "port": 8000, 4 | "models": [ 5 | { 6 | "model": "models/mistral-7b-instruct-v0.1.Q4_0.gguf", 7 | "model_alias": "mistral", 8 | "chat_format": "chatml", 9 | "n_gpu_layers": -1, 10 | "offload_kqv": true, 11 | "n_threads": 12, 12 | "n_batch": 512, 13 | "n_ctx": 2048 14 | }, 15 | { 16 | "model": "models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf", 17 | "model_alias": "mixtral", 18 | "chat_format": "chatml", 19 | "n_gpu_layers": -1, 20 | "offload_kqv": true, 21 | 
"n_threads": 12, 22 | "n_batch": 512, 23 | "n_ctx": 2048 24 | }, 25 | { 26 | "model": "models/mistral-7b-instruct-v0.1.Q4_0.gguf", 27 | "model_alias": "mistral-function-calling", 28 | "chat_format": "functionary", 29 | "n_gpu_layers": -1, 30 | "offload_kqv": true, 31 | "n_threads": 12, 32 | "n_batch": 512, 33 | "n_ctx": 2048 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /stock_data.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import json 3 | from datetime import datetime 4 | 5 | 6 | def get_stock_prices(ticker, days): 7 | try: 8 | # Fetch stock data 9 | stock_data = yf.download(ticker, period=f"{days}d", interval="1d") 10 | 11 | # Format the DateTimeIndex to dd/mm/yyyy format 12 | stock_data.index = stock_data.index.strftime("%d/%m/%Y") 13 | 14 | # Convert to JSON format, ensuring dates are strings 15 | stock_json = stock_data.to_json(orient="index") 16 | 17 | # Parse JSON string to JSON object 18 | stock_prices = json.loads(stock_json) 19 | 20 | return stock_prices 21 | 22 | except Exception as e: 23 | return {"error": str(e)} 24 | 25 | 26 | # Example usage: 27 | ticker = "AAPL" # Example ticker 28 | days = 30 # Example number of days 29 | prices = get_stock_prices(ticker, days) 30 | print(prices) 31 | --------------------------------------------------------------------------------