├── .gitattributes
├── .gitignore
├── Dockerfile
├── README.md
├── app
│   ├── fastapi_server.py
│   ├── high_level_api_embedding.py
│   ├── high_level_api_inference.py
│   ├── high_level_api_streaming.py
│   └── langchain_custom_llm.py
├── docker-compose.yml
└── requirements.txt

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
models
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM mcr.microsoft.com/devcontainers/python:0-3.11

COPY requirements.txt /tmp/pip-tmp/
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
    && rm -rf /tmp/pip-tmp

WORKDIR /app
COPY app/ ./

EXPOSE 8000
ENTRYPOINT [ "uvicorn", "--reload", "--host", "0.0.0.0", "fastapi_server:app" ]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# gpt4all-ui

Simple Docker Compose setup that serves gpt4all (llama.cpp) as an API and chatbot-ui as the web interface. Together they mimic OpenAI's ChatGPT, but run as a local, offline instance.

*(Screenshot: chatbot-ui running against the local API, 2023-04-06)*

### Clone the repository (with submodules)

```
git clone https://github.com/mkellerman/gpt4all-ui.git
cd gpt4all-ui
```

### Copy your models into the workspace

Models should be copied into the `models` folder in the root of the `gpt4all-ui` folder.

If your models live somewhere else, you can create a symbolic link to that folder instead:

```
ln -s /Users/Me/Documents/Models models
```

### Start it up

```
docker-compose up -d
```

Then open http://localhost:3000.
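### Query the API directly

chatbot-ui talks to the `api` container over the OpenAI-compatible routes exposed by the bundled llama-cpp-python server (`/v1/completions`, `/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so you can also call the API yourself once the containers are up. A minimal sketch from the host using the `requests` package (`pip install requests`); the `model` field can be omitted because the bundled server ignores it, and the response is assumed to follow the OpenAI chat completion schema the server mirrors:

```python
import requests

# The api service is published on port 8000 by docker-compose.yml.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "max_tokens": 64,
    },
    timeout=300,  # CPU inference can be slow
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```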
### Sources:

chatbot-ui (web): https://github.com/mckaywrigley/chatbot-ui

llama-cpp-python (api): https://github.com/abetlen/llama-cpp-python
--------------------------------------------------------------------------------
/app/fastapi_server.py:
--------------------------------------------------------------------------------
"""Example FastAPI server for llama.cpp.

To run this example:

```bash
pip install fastapi uvicorn sse-starlette
export MODEL=../models/7B/...
uvicorn fastapi_server:app --reload
```

Then visit http://localhost:8000/docs to see the interactive API docs.

"""
import os
import json
from typing import List, Optional, Literal, Union, Iterator, Dict
from typing_extensions import TypedDict

import llama_cpp

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse


class Settings(BaseSettings):
    model: str
    n_ctx: int = 2048
    n_batch: int = 8
    n_threads: int = max((os.cpu_count() or 2) // 2, 1)
    f16_kv: bool = True
    use_mlock: bool = False  # Fails silently on platforms that don't support mlock (e.g. Windows); took forever to figure out...
    embedding: bool = True
    last_n_tokens_size: int = 64


app = FastAPI(
    title="🦙 llama.cpp Python API",
    version="0.0.1",
)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
settings = Settings()
llama = llama_cpp.Llama(
    settings.model,
    f16_kv=settings.f16_kv,
    use_mlock=settings.use_mlock,
    embedding=settings.embedding,
    n_threads=settings.n_threads,
    n_batch=settings.n_batch,
    n_ctx=settings.n_ctx,
    last_n_tokens_size=settings.last_n_tokens_size,
)


class CreateCompletionRequest(BaseModel):
    prompt: str
    suffix: Optional[str] = Field(None)
    max_tokens: int = 16
    temperature: float = 0.8
    top_p: float = 0.95
    echo: bool = False
    stop: List[str] = []
    stream: bool = False

    # ignored or currently unsupported
    model: Optional[str] = Field(None)
    n: Optional[int] = 1
    logprobs: Optional[int] = Field(None)
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
    best_of: Optional[int] = 1
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = 40
    repeat_penalty: float = 1.1

    class Config:
        schema_extra = {
            "example": {
                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                "stop": ["\n", "###"],
            }
        }


CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)


@app.post(
    "/v1/completions",
    response_model=CreateCompletionResponse,
)
def create_completion(request: CreateCompletionRequest):
    # Drop the OpenAI-only fields before handing the request to llama.cpp,
    # for both the streaming and the blocking path.
    kwargs = request.dict(
        exclude={
            "model",
            "n",
            "logprobs",
            "frequency_penalty",
            "presence_penalty",
            "best_of",
            "logit_bias",
            "user",
        }
    )
    if request.stream:
        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**kwargs)  # type: ignore
        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
    return llama(**kwargs)


class CreateEmbeddingRequest(BaseModel):
    model: Optional[str]
    input: str
    user: Optional[str]

    class Config:
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }


CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)


@app.post(
    "/v1/embeddings",
    response_model=CreateEmbeddingResponse,
)
def create_embedding(request: CreateEmbeddingRequest):
    return llama.create_embedding(**request.dict(exclude={"model", "user"}))


class ChatCompletionRequestMessage(BaseModel):
    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
    content: str
    user: Optional[str] = None


class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage]
    temperature: float = 0.8
    top_p: float = 0.95
    stream: bool = False
    stop: List[str] = []
    max_tokens: int = 128

    # ignored or currently unsupported
    model: Optional[str] = Field(None)
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    repeat_penalty: float = 1.1

    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)


@app.post(
    "/v1/chat/completions",
    response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
    request: CreateChatCompletionRequest,
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
    completion_or_chunks = llama.create_chat_completion(
        **request.dict(
            exclude={
                "model",
                "n",
                "presence_penalty",
                "frequency_penalty",
                "logit_bias",
                "user",
            }
        ),
    )

    if request.stream:

        async def server_sent_events(
            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
        ):
            for chat_chunk in chat_chunks:
                yield dict(data=json.dumps(chat_chunk))
            yield dict(data="[DONE]")

        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore

        return EventSourceResponse(
            server_sent_events(chunks),
        )
    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
    return completion


class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


GetModelResponse = create_model_from_typeddict(ModelList)


@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
    return {
        "object": "list",
        "data": [
            {
                "id": llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)))
--------------------------------------------------------------------------------
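When `stream` is set, the server above wraps each chunk in a server-sent event (`data: <json>` lines, with a final `data: [DONE]` marker on the chat route, as emitted by `server_sent_events`). A rough client-side sketch that prints the streamed reply as it arrives; the hand-rolled `data:` parsing and the OpenAI-style `delta` field on each chunk are assumptions about the framing rather than a full SSE client:

```python
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Name three French cities."}],
        "max_tokens": 64,
        "stream": True,
    },
    stream=True,
    timeout=300,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        # sse-starlette frames each chunk as a "data: ..." line; skip blanks and comments.
        if not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```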
/app/high_level_api_embedding.py:
--------------------------------------------------------------------------------
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model, embedding=True)

print(llm.create_embedding("Hello world!"))
--------------------------------------------------------------------------------
/app/high_level_api_inference.py:
--------------------------------------------------------------------------------
import json
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model)

output = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    echo=True,
)

print(json.dumps(output, indent=2))
--------------------------------------------------------------------------------
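The dictionary printed by `high_level_api_inference.py` follows the OpenAI completion schema (`choices`, `usage`, `finish_reason`), so scripts usually pull out just the generated text instead of dumping the whole object. A small sketch that could be appended to the script above, reusing its `output` variable:

```python
# Continuation of high_level_api_inference.py: read fields out of the completion dict.
choice = output["choices"][0]
text = choice["text"]                    # includes the prompt because echo=True was passed
finish_reason = choice["finish_reason"]  # "stop" when a stop sequence was hit, "length" otherwise
usage = output["usage"]                  # token accounting for the call

print(text.strip())
print(f"finish_reason={finish_reason}, total_tokens={usage['total_tokens']}")
```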
/app/high_level_api_streaming.py:
--------------------------------------------------------------------------------
import json
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model)

stream = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
)

for output in stream:
    print(json.dumps(output, indent=2))
--------------------------------------------------------------------------------
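Each item yielded by the stream is a small completion chunk in the same OpenAI-style shape, so a chat-like "typing" effect only needs the text field of every chunk. A minimal variant of the loop above, assuming the same `llm` and prompt as in `high_level_api_streaming.py`:

```python
# Print only the newly generated text of each chunk, on one line, as it arrives.
for chunk in llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()  # final newline once the stream is exhausted
```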
/app/langchain_custom_llm.py:
--------------------------------------------------------------------------------
import argparse

from llama_cpp import Llama

from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any


class LlamaLLM(LLM):
    model_path: str
    llm: Llama

    @property
    def _llm_type(self) -> str:
        return "llama-cpp-python"

    def __init__(self, model_path: str, **kwargs: Any):
        llm = Llama(model_path=model_path)
        super().__init__(model_path=model_path, llm=llm, **kwargs)

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.llm(prompt, stop=stop or [])
        return response["choices"][0]["text"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"model_path": self.model_path}


parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

# Load the model
llm = LlamaLLM(model_path=args.model)

# Basic Q&A
answer = llm(
    "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"]
)
print(f"Answer: {answer.strip()}")

# Using in a chain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["product"],
    template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n",
)
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain only specifying the input variable.
print(chain.run("colorful socks"))
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.6'

services:

  chatgpt:
    image: ghcr.io/mckaywrigley/chatbot-ui:main
    platform: linux/amd64
    ports:
      - 3000:3000
    environment:
      - 'OPENAI_API_KEY=000000000000000'
      - 'OPENAI_API_HOST=http://api:8000'

  api:
    build: .
    volumes:
      - ./models:/models
    ports:
      - 8000:8000
    environment:
      - MODEL=/models/7B/gpt4all-lora-quantized.ggml
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
llama-cpp-python
uvicorn
fastapi
sse_starlette
typing_extensions
--------------------------------------------------------------------------------