├── .gitattributes
├── .gitignore
├── Dockerfile
├── README.md
├── app
│   ├── fastapi_server.py
│   ├── high_level_api_embedding.py
│   ├── high_level_api_inference.py
│   ├── high_level_api_streaming.py
│   └── langchain_custom_llm.py
├── docker-compose.yml
└── requirements.txt

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
models
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM mcr.microsoft.com/devcontainers/python:0-3.11

COPY requirements.txt /tmp/pip-tmp/
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
    && rm -rf /tmp/pip-tmp

WORKDIR /app
COPY app/ ./

EXPOSE 8000
ENTRYPOINT [ "uvicorn", "--reload", "--host", "0.0.0.0", "fastapi_server:app" ]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# gpt4all-ui

Simple Docker Compose setup that serves gpt4all (llama.cpp) as an API and chatbot-ui as the web interface. Together they mimic OpenAI's ChatGPT, but run as a local, offline instance.

*(Screenshot: chatbot-ui running against the local API, 2023-04-06)*

### Clone the repository (with submodules)

```
git clone https://github.com/mkellerman/gpt4all-ui.git
cd gpt4all-ui
```

### Copy your models into the workspace

Models should be copied into the `models` folder in the root of the `gpt4all-ui` folder.

If your models live somewhere else, you can create a symbolic link to that folder instead:

```
ln -s /Users/Me/Documents/Models models
```

### Start it up

```
docker-compose up -d
```

Then open http://localhost:3000.
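### Query the API directly

chatbot-ui talks to the `api` container over the OpenAI-compatible routes exposed by the bundled llama-cpp-python server (`/v1/completions`, `/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so you can also call the API yourself once the containers are up. A minimal sketch from the host using the `requests` package (`pip install requests`); the `model` field can be omitted because the bundled server ignores it, and the response is assumed to follow the OpenAI chat completion schema the server mirrors:

```python
import requests

# The api service is published on port 8000 by docker-compose.yml.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "max_tokens": 64,
    },
    timeout=300,  # CPU inference can be slow
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```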
### Sources:

chatbot-ui (web): https://github.com/mckaywrigley/chatbot-ui

llama-cpp-python (api): https://github.com/abetlen/llama-cpp-python
--------------------------------------------------------------------------------
/app/fastapi_server.py:
--------------------------------------------------------------------------------
"""Example FastAPI server for llama.cpp.

To run this example:

```bash
pip install fastapi uvicorn sse-starlette
export MODEL=../models/7B/...
uvicorn fastapi_server:app --reload
```

Then visit http://localhost:8000/docs to see the interactive API docs.

"""
import os
import json
from typing import List, Optional, Literal, Union, Iterator, Dict
from typing_extensions import TypedDict

import llama_cpp

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse


class Settings(BaseSettings):
    model: str
    n_ctx: int = 2048
    n_batch: int = 8
    n_threads: int = max((os.cpu_count() or 2) // 2, 1)
    f16_kv: bool = True
    use_mlock: bool = False  # Fails silently on platforms that don't support mlock (e.g. Windows); took forever to figure out...
    embedding: bool = True
    last_n_tokens_size: int = 64


app = FastAPI(
    title="🦙 llama.cpp Python API",
    version="0.0.1",
)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
settings = Settings()
llama = llama_cpp.Llama(
    settings.model,
    f16_kv=settings.f16_kv,
    use_mlock=settings.use_mlock,
    embedding=settings.embedding,
    n_threads=settings.n_threads,
    n_batch=settings.n_batch,
    n_ctx=settings.n_ctx,
    last_n_tokens_size=settings.last_n_tokens_size,
)


class CreateCompletionRequest(BaseModel):
    prompt: str
    suffix: Optional[str] = Field(None)
    max_tokens: int = 16
    temperature: float = 0.8
    top_p: float = 0.95
    echo: bool = False
    stop: List[str] = []
    stream: bool = False

    # ignored or currently unsupported
    model: Optional[str] = Field(None)
    n: Optional[int] = 1
    logprobs: Optional[int] = Field(None)
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
    best_of: Optional[int] = 1
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = 40
    repeat_penalty: float = 1.1

    class Config:
        schema_extra = {
            "example": {
                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                "stop": ["\n", "###"],
            }
        }


CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)


@app.post(
    "/v1/completions",
    response_model=CreateCompletionResponse,
)
def create_completion(request: CreateCompletionRequest):
    # Drop the OpenAI-only fields before handing the request to llama.cpp,
    # for both the streaming and the blocking path.
    kwargs = request.dict(
        exclude={
            "model",
            "n",
            "logprobs",
            "frequency_penalty",
            "presence_penalty",
            "best_of",
            "logit_bias",
            "user",
        }
    )
    if request.stream:
        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**kwargs)  # type: ignore
        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
    return llama(**kwargs)


class CreateEmbeddingRequest(BaseModel):
    model: Optional[str]
    input: str
    user: Optional[str]

    class Config:
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }


CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)


@app.post(
    "/v1/embeddings",
    response_model=CreateEmbeddingResponse,
)
def create_embedding(request: CreateEmbeddingRequest):
    return llama.create_embedding(**request.dict(exclude={"model", "user"}))


class ChatCompletionRequestMessage(BaseModel):
    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
    content: str
    user: Optional[str] = None


class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage]
    temperature: float = 0.8
    top_p: float = 0.95
    stream: bool = False
    stop: List[str] = []
    max_tokens: int = 128

    # ignored or currently unsupported
    model: Optional[str] = Field(None)
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    repeat_penalty: float = 1.1

    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)


@app.post(
    "/v1/chat/completions",
    response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
    request: CreateChatCompletionRequest,
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
    completion_or_chunks = llama.create_chat_completion(
        **request.dict(
            exclude={
                "model",
                "n",
                "presence_penalty",
                "frequency_penalty",
                "logit_bias",
                "user",
            }
        ),
    )

    if request.stream:

        async def server_sent_events(
            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
        ):
            for chat_chunk in chat_chunks:
                yield dict(data=json.dumps(chat_chunk))
            yield dict(data="[DONE]")

        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore

        return EventSourceResponse(
            server_sent_events(chunks),
        )
    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
    return completion


class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


GetModelResponse = create_model_from_typeddict(ModelList)


@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
    return {
        "object": "list",
        "data": [
            {
                "id": llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)))
--------------------------------------------------------------------------------
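When `stream` is set, the server above wraps each chunk in a server-sent event (`data: <json>` lines, with a final `data: [DONE]` marker on the chat route, as emitted by `server_sent_events`). A rough client-side sketch that prints the streamed reply as it arrives; the hand-rolled `data:` parsing and the OpenAI-style `delta` field on each chunk are assumptions about the framing rather than a full SSE client:

```python
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Name three French cities."}],
        "max_tokens": 64,
        "stream": True,
    },
    stream=True,
    timeout=300,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        # sse-starlette frames each chunk as a "data: ..." line; skip blanks and comments.
        if not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```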
/app/high_level_api_embedding.py:
--------------------------------------------------------------------------------
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model, embedding=True)

print(llm.create_embedding("Hello world!"))
--------------------------------------------------------------------------------
/app/high_level_api_inference.py:
--------------------------------------------------------------------------------
import json
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model)

output = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    echo=True,
)

print(json.dumps(output, indent=2))
--------------------------------------------------------------------------------
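The dictionary printed by `high_level_api_inference.py` follows the OpenAI completion schema (`choices`, `usage`, `finish_reason`), so scripts usually pull out just the generated text instead of dumping the whole object. A small sketch that could be appended to the script above, reusing its `output` variable:

```python
# Continuation of high_level_api_inference.py: read fields out of the completion dict.
choice = output["choices"][0]
text = choice["text"]                    # includes the prompt because echo=True was passed
finish_reason = choice["finish_reason"]  # "stop" when a stop sequence was hit, "length" otherwise
usage = output["usage"]                  # token accounting for the call

print(text.strip())
print(f"finish_reason={finish_reason}, total_tokens={usage['total_tokens']}")
```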
/app/high_level_api_streaming.py:
--------------------------------------------------------------------------------
import json
import argparse

from llama_cpp import Llama

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

llm = Llama(model_path=args.model)

stream = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
)

for output in stream:
    print(json.dumps(output, indent=2))
--------------------------------------------------------------------------------
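Each item yielded by the stream is a small completion chunk in the same OpenAI-style shape, so a chat-like "typing" effect only needs the text field of every chunk. A minimal variant of the loop above, assuming the same `llm` and prompt as in `high_level_api_streaming.py`:

```python
# Print only the newly generated text of each chunk, on one line, as it arrives.
for chunk in llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()  # final newline once the stream is exhausted
```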
/app/langchain_custom_llm.py:
--------------------------------------------------------------------------------
import argparse

from llama_cpp import Llama

from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any


class LlamaLLM(LLM):
    model_path: str
    llm: Llama

    @property
    def _llm_type(self) -> str:
        return "llama-cpp-python"

    def __init__(self, model_path: str, **kwargs: Any):
        llm = Llama(model_path=model_path)
        super().__init__(model_path=model_path, llm=llm, **kwargs)

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.llm(prompt, stop=stop or [])
        return response["choices"][0]["text"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"model_path": self.model_path}


parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="./models/...")
args = parser.parse_args()

# Load the model
llm = LlamaLLM(model_path=args.model)

# Basic Q&A
answer = llm(
    "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"]
)
print(f"Answer: {answer.strip()}")

# Using in a chain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["product"],
    template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n",
)
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain only specifying the input variable.
print(chain.run("colorful socks"))
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.6'

services:

  chatgpt:
    image: ghcr.io/mckaywrigley/chatbot-ui:main
    platform: linux/amd64
    ports:
      - 3000:3000
    environment:
      - 'OPENAI_API_KEY=000000000000000'
      - 'OPENAI_API_HOST=http://api:8000'

  api:
    build: .
    volumes:
      - ./models:/models
    ports:
      - 8000:8000
    environment:
      - MODEL=/models/7B/gpt4all-lora-quantized.ggml
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
llama-cpp-python
uvicorn
fastapi
sse_starlette
typing_extensions
--------------------------------------------------------------------------------