├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE.md
├── Makefile
├── README.md
├── __init__.py
├── configs
│   └── config.ini
├── core.py
├── example
│   └── index.html
├── main.py
├── requirements.txt
├── src
│   └── quantize.py
├── style.css
└── webui.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore downloaded models
2 | models/models--*
3 | models/.locks
4 | models/tmp*
5 | configs/config.ini
6 |
7 | src/original_model/*
8 | src/quantized_model/*
9 |
10 | src/llama_cpp
11 |
12 | # Ignore build folder (for CMake)
13 | build
14 |
15 | # Compiled Python files
16 | *.pyc
17 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Minimum required CMake version
2 | cmake_minimum_required(VERSION 3.15)
3 |
4 | # Project name
5 | project(llama_cpp)
6 |
7 | # Git repository location
8 | set(REPO_URL "https://github.com/ggerganov/llama.cpp")
9 |
10 | # Requirements file
11 | set(REQUIREMENTS_FILE "requirements.txt")
12 |
13 | # Llama directory
14 | set(LLAMA_DIR "${PROJECT_SOURCE_DIR}/src/llama_cpp")
15 |
16 | # Check for Python and Git using CMake's FIND_PACKAGE
17 | find_package(Python3 COMPONENTS Interpreter REQUIRED)
18 | find_package(Git REQUIRED)
19 |
20 | # Download and clone the llama.cpp repository
21 | execute_process(
22 |     COMMAND git clone ${REPO_URL} ${LLAMA_DIR}
23 |     RESULT_VARIABLE git_result
24 | )
25 |
26 | # Error handling for Git clone
27 | if(NOT ${git_result} EQUAL 0)
28 |     message(FATAL_ERROR "Failed to clone llama.cpp repository")
29 | endif()
30 |
31 | # Install Python requirements
32 | execute_process(
33 |     COMMAND pip install -r "${LLAMA_DIR}/${REQUIREMENTS_FILE}"
34 | )
35 |
36 | file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/src/quantized_model")
37 |
38 | find_program(PYTHON NAMES python python3)
39 |
40 | if(PYTHON)
41 |     file(APPEND "${PROJECT_SOURCE_DIR}/configs/config.ini" "py_cmd = ${PYTHON}\n")
42 | endif()
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Step 1: Use an official base image
2 | FROM python:3.11.9-slim
3 |
4 | # Step 2: Install C++ compiler and other dependencies
5 | RUN apt-get update && apt-get install -y \
6 |     g++ \
7 |     make \
8 |     git \
9 |     && apt-get clean
10 |
11 | # Step 3: Set the working directory
12 | WORKDIR /app
13 |
14 | # Step 4: Copy the source code into the container
15 | COPY . /app
16 |
17 | # Step 5: Install Python dependencies
18 | RUN pip install --no-cache-dir -r requirements.txt
19 |
20 | # Step 6: Run makefile to build LLMinator
21 | RUN make
22 |
23 | # Step 7: Launch LLMinator
24 | CMD ["python3","webui.py"]
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Aesthisia Datacenters
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile to clone llama.cpp repository and install requirements
2 |
3 | # Variables
4 | REPO_URL := https://github.com/ggerganov/llama.cpp
5 | REQUIREMENTS_FILE := requirements.txt
6 | LLAMA_DIR := src/llama_cpp
7 |
8 | # Determine pip command
9 | PIP := $(shell command -v pip3 2>/dev/null || command -v pip)
10 |
11 | # Check if python and git are installed
12 | PYTHON := $(shell command -v python 2>/dev/null || command -v python3 2>/dev/null)
13 | GIT := $(shell command -v git)
14 |
15 | ifeq ($(PYTHON),)
16 | $(error Python is not installed. Please install Python before running this Makefile.)
17 | endif
18 |
19 | ifeq ($(GIT),)
20 | $(error Git is not installed. Please install Git before running this Makefile.)
21 | endif
22 |
23 | # Targets
24 | .PHONY: all clone install clean quantized_model_dir append_to_configs
25 |
26 | all: clone install quantized_model_dir append_to_configs
27 |
28 | clone:
29 | 	mkdir -p $(LLAMA_DIR)
30 | 	git clone $(REPO_URL) $(LLAMA_DIR)
31 |
32 | install:
33 | 	cd $(LLAMA_DIR) && \
34 | 	$(PIP) install -r $(REQUIREMENTS_FILE)
35 |
36 | quantized_model_dir:
37 | 	mkdir -p src/quantized_model
38 |
39 | append_to_configs:
40 | 	echo "py_cmd = $(PYTHON)" >> configs/config.ini
41 |
42 | clean:
43 | 	rm -rf $(LLAMA_DIR)
44 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## LLMinator: Run & Test LLMs directly from HuggingFace
2 |
3 | #### Gradio-based tool with an integrated chatbot to locally run & test LLMs directly from HuggingFace.
4 |
5 | An easy-to-use tool made with Gradio, LangChain, and Torch.
6 |
7 | 
8 |
9 | 
10 |
11 | ### ⚡ Features
12 |
13 | - Context-aware streaming chatbot.
14 | - Inbuilt code syntax highlighting.
15 | - Load any LLM repo directly from HuggingFace.
16 | - Supports both CPU & CUDA modes.
17 | - Enable LLM inference with [llama.cpp](https://github.com/ggerganov/llama.cpp) using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python).
18 | - Convert models (Safetensors, PyTorch .pt, etc.) to GGUF.
19 | - Customize LLM inference parameters (n_gpu_layers, temperature, max_tokens, etc.).
20 | - Real-time text generation via WebSockets, enabling seamless integration with different frontend frameworks.
21 |
22 | ## 🚀 Installation
23 |
24 | To use LLMinator, follow these simple steps:
25 |
26 | #### Clone the LLMinator repository from GitHub & install requirements
27 |
28 | ```bash
29 | git clone https://github.com/Aesthisia/LLMinator.git
30 | cd LLMinator
31 | pip install -r requirements.txt
32 | ```
33 |
34 | #### Build LLMinator with [llama.cpp](https://github.com/ggerganov/llama.cpp):
35 |
36 | - Using `make`:
37 |
38 |   - On Linux or macOS:
39 |
40 |     ```bash
41 |     make
42 |     ```
43 |
44 |   - On Windows:
45 |
46 |     1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
47 |     2. Extract `w64devkit` on your PC.
48 |     3. Run `w64devkit.exe`.
49 |     4. Use the `cd` command to reach the `LLMinator` folder.
50 |     5. From here you can run:
51 |        ```bash
52 |        make
53 |        ```
54 |
55 | - Using `CMake`:
56 |   ```bash
57 |   mkdir build
58 |   cd build
59 |   cmake ..
60 |   ```
61 |
62 | #### Launch LLMinator in the browser
63 |
64 | - Run the LLMinator tool using the command `python webui.py`.
65 | - Access the web interface by opening [http://127.0.0.1:7860](http://127.0.0.1:7860) in your browser.
66 | - Start interacting with the chatbot and experimenting with LLMs!
67 |
68 |
69 |
70 | Check out this YouTube [video](https://www.youtube.com/watch?v=OL8wRYbdjLE) to follow the installation steps.
71 |
72 | ### Command line arguments
73 |
74 | | Argument Command | Default | Description |
75 | | ---------------- | --------- | --------------------------------------------------------------------------- |
76 | | --host           | 0.0.0.0   | Host or IP address on which the server will listen for incoming connections |
77 | | --port | 7860 | Launch gradio with given server port |
78 | | --share | False | This generates a public shareable link that you can send to anybody |
79 |
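For example, `python webui.py --host 127.0.0.1 --port 8080` keeps the UI local while serving it on a different port; any argument you omit falls back to the default listed above.
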
80 | ### Connect to WebSocket for generation
81 |
82 | Connect to [ws://localhost:7861/](ws://localhost:7861/) for real-time text generation. Submit prompts and receive streamed responses over the WebSocket connection.
83 |
84 | **Integration with Frontends:**
85 |
86 | The provided `example/index.html` demonstrates basic usage of text generation over a WebSocket connection, and you can integrate the same flow with any frontend framework, such as React.js. A minimal Python client is sketched below.
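
If you prefer to drive generation from a script instead of a browser, here is a minimal client sketch. It assumes the `websockets` Python package is installed (`pip install websockets`) and that LLMinator's WebSocket server is running on its default port 7861; since the server streams raw token chunks without an explicit end-of-stream marker, the sketch simply stops after a short idle timeout.

```python
import asyncio
import websockets

async def generate(prompt: str, url: str = "ws://localhost:7861/") -> str:
    """Send a prompt and collect the streamed token chunks."""
    response = ""
    async with websockets.connect(url) as ws:
        await ws.send(prompt)  # submit the prompt
        while True:
            try:
                # wait for the next streamed chunk (raise the timeout on slow CPUs)
                chunk = await asyncio.wait_for(ws.recv(), timeout=30)
            except asyncio.TimeoutError:
                break  # no new chunks for a while; assume generation finished
            print(chunk, end="", flush=True)  # live token stream
            response += chunk
    return response

if __name__ == "__main__":
    asyncio.run(generate("Write a short haiku about local LLMs."))
```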
87 |
88 | ## Installation and Development Tips
89 |
90 | #### Python Version
91 |
92 | - **Compatible Versions:** This project is compatible with Python versions 3.8 through 3.11. Ensure you have one of these versions installed on your system. You can check your Python version by running `python --version` or `python3 --version` in your terminal.
93 |
94 | #### CMake and C Compiler
95 |
96 | - **CMake Dependency:** If you plan to build the project using CMake, make sure you have CMake installed.
97 | - **C Compiler:** Additionally, you'll need a C/C++ compiler such as GCC. One is typically included with most Linux distributions; you can check by running `gcc --version` in your terminal. Installation instructions for your specific operating system can be found online.
98 |
99 | #### Visual Studio Code
100 |
101 | - **Visual Studio Installer:** If you're using Visual Studio Code for development, you'll also need the C++ development workload, which you can install through the [Visual Studio Installer](https://visualstudio.microsoft.com/vs/features/cplusplus/).
102 |
103 | #### GPU Acceleration (CUDA)
104 |
105 | - **CUDA Installation:** To leverage GPU acceleration, you'll need CUDA installed on your system. Download instructions are available on the [NVIDIA website](https://developer.nvidia.com/cuda-toolkit).
106 | - **Torch Compatibility:** After installing CUDA, confirm that it is available to PyTorch with `torch.cuda.is_available()` (a quick check is shown below). When using a GPU, ensure you follow the project's specific `llama-cpp-python` installation configuration for CUDA support.
107 |
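As a quick sanity check (a minimal sketch; `torch` comes from this project's `requirements.txt`):

```python
# Prints True only when PyTorch was built with CUDA support and a GPU is visible.
import torch

print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```
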
108 | ## Reporting Issues
109 |
110 | If you encounter any errors or issues, feel free to file a detailed report in the project's repository. We're always happy to help! When reporting an issue, please provide as much information as possible, including the error message, logs, the steps you took, and your system configuration. This makes it easier for us to diagnose and fix the problem quickly.
111 |
112 | ## 🤝 Contributions
113 |
114 | We welcome contributions from the community to enhance LLMinator further. If you'd like to contribute, please follow these guidelines:
115 |
116 | - Fork the LLMinator repository on GitHub.
117 | - Create a new branch for your feature or bug fix.
118 | - Test your changes thoroughly.
119 | - Submit a pull request, providing a clear description of the changes you've made.
120 |
121 | Reach out to us: info@aesthisia.com
122 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/configs/config.ini:
--------------------------------------------------------------------------------
1 | [Settings]
2 | execution_provider =
3 | repo_id =
4 |
--------------------------------------------------------------------------------
/core.py:
--------------------------------------------------------------------------------
1 | import os, shutil
2 | from configparser import ConfigParser
3 | import gradio as gr
4 |
5 | default_repo_id = "stabilityai/stable-code-instruct-3b"
6 | config_path = "configs/config.ini"
7 | cache_gguf_dir = os.path.join(os.getcwd(), "src/quantized_model")
8 | cache_original_dir = os.path.join(os.getcwd(), "src/original_model")
9 |
10 | def format_gguf_model_name(file_name):
11 |     parts = file_name.replace('.gguf', '').split("__")
12 |     return "/".join(parts)
13 |
14 | def list_converted_gguf_models(cache_gguf_dir):
15 |     contents = os.listdir(cache_gguf_dir)
16 |     model_files = [format_gguf_model_name(item) for item in contents]
17 |     return model_files
18 |
19 | def removeModelFromCache(model_name):
20 |     config = ConfigParser()
21 |     config.read(config_path)
22 |     repo_id = config.get('Settings', 'repo_id')
23 |     if model_name == repo_id:
24 |         raise gr.Error("Cannot delete the currently loaded model")
25 |     else:
26 |         gguf_model_name = model_name.replace("/", "__") + ".gguf"
27 |         original_model_parts = model_name.split("/")
28 |         original_model_name = f"model--{'--'.join(original_model_parts)}"
29 |         try:
30 |             os.remove(os.path.join(cache_gguf_dir, gguf_model_name))
31 |             shutil.rmtree(os.path.join(cache_original_dir, original_model_name))
32 |         except FileNotFoundError:
33 |             raise gr.Error("Model not found in cache.")
34 |
35 | def read_config():
36 |     config = ConfigParser()
37 |     config.read(config_path)
38 |     if config.get('Settings', 'repo_id') == "" and config.get('Settings', 'execution_provider') == "":
39 |         return None, config
40 |     else:
41 |         return config, config
42 |
43 | def update_config(config, **kwargs):
44 |     for key, value in kwargs.items():
45 |         config.set('Settings', key, value)
46 |     with open(config_path, 'w') as configfile:
47 |         config.write(configfile)
/example/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | LLMinator
7 |
8 |
9 | LLMinator
10 |
14 |
15 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from fastapi import FastAPI, WebSocket
3 | from src import quantize
4 | from langchain_community.llms import LlamaCpp
5 | from langchain.callbacks.manager import CallbackManager
6 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
7 | from langchain_core.prompts import PromptTemplate
8 | from core import default_repo_id
9 |
10 |
11 | app = FastAPI()
12 |
13 | # Callbacks support token-wise streaming
14 | callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
15 |
16 | # check if CUDA is available
17 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
18 |
19 | n_gpu_layers = None
20 | if device == "cuda":
21 |     n_gpu_layers = -1
22 | else:
23 |     n_gpu_layers = 0
24 | n_ctx = 6000
25 | n_batch = 30
26 | n_parts = 1
27 | temperature = 0.9
28 | max_tokens = 500
29 |
30 | def snapshot_download_and_convert_to_gguf(repo_id):
31 |     gguf_model_path = quantize.quantize_model(repo_id)
32 |     return gguf_model_path
33 |
34 | def init_llm_chain(model_path):
35 |     llm = LlamaCpp(
36 |         model_path=model_path,
37 |         n_gpu_layers=n_gpu_layers,
38 |         n_ctx=n_ctx,
39 |         n_batch=n_batch,
40 |         temperature=temperature,
41 |         max_tokens=max_tokens,
42 |         n_parts=n_parts,
43 |         callback_manager=callback_manager,
44 |         verbose=True
45 |     )
46 |
47 |     template = """Question: {question}
48 | Answer: Let's work this out in a step by step way to be sure we have the right answer."""
49 |
50 |     prompt = PromptTemplate.from_template(template)
51 |     llm_chain = prompt | llm
52 |     return llm_chain, llm
53 |
54 | model_path = snapshot_download_and_convert_to_gguf(default_repo_id)
55 |
56 | llm_chain, llm = init_llm_chain(model_path)
57 |
58 | @app.websocket("/generate")
59 | async def websocket_endpoint(websocket: WebSocket):
60 |     await websocket.accept()
61 |     while True:
62 |         prompt = await websocket.receive_text()
63 |
64 |         async def bot(prompt):
65 |             print("Question: ", prompt)
66 |             output = llm_chain.stream(prompt)
67 |             print("stream:", output)
68 |             for character in output:
69 |                 print(character)
70 |                 await websocket.send_text(character)
71 |
72 |         await bot(prompt)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gradio==4.27.0
2 | huggingface_hub==0.21.1
3 | langchain==0.1.14
4 | torch==2.1.2
5 | llama-cpp-python==0.2.76
6 | fastapi==0.110.1
--------------------------------------------------------------------------------
/src/quantize.py:
--------------------------------------------------------------------------------
1 | import subprocess, os
2 | from huggingface_hub import snapshot_download
3 | from configparser import ConfigParser
4 |
5 | config_path = "./configs/config.ini"
6 |
7 | def get_py_cmd():
8 |     config = ConfigParser()
9 |     config.read(config_path)
10 |     py_cmd = config.get('Settings', 'py_cmd')
11 |     if "python3" in py_cmd:
12 |         return 'python3'
13 |     else:
14 |         return 'python'
15 |
16 | def quantize_model(repo_id):
17 |     original_models_path = "./src/original_model/"
18 |     quantized_path = "./src/quantized_model/"
19 |
20 |     repo_id_parts = repo_id.split("/")
21 |     model_folder = f"model--{'--'.join(repo_id_parts)}"
22 |     model_path = original_models_path + model_folder
23 |
24 |     outfile = quantized_path + repo_id.replace("/", "__") + ".gguf"
25 |
26 |     if os.path.isfile(outfile):
27 |         return outfile
28 |
29 |     snapshot_download(repo_id=repo_id, local_dir=model_path, local_dir_use_symlinks=True)
30 |
31 |     command = [
32 |         get_py_cmd(),
33 |         './src/llama_cpp/convert-hf-to-gguf.py',
34 |         model_path,
35 |         '--outtype', 'f16',
36 |         '--outfile', outfile
37 |     ]
38 |
39 |     subprocess.run(command, check=True)
40 |
41 |     return outfile
42 |
--------------------------------------------------------------------------------
/style.css:
--------------------------------------------------------------------------------
1 | #chatbot-container {
2 |   min-height: calc(100vh - 200px);
3 | }
4 |
5 | #title-container {
6 |   max-height: 50px;
7 | }
8 |
9 | #configs-container {
10 |   max-width: 700px;
11 | }
--------------------------------------------------------------------------------
/webui.py:
--------------------------------------------------------------------------------
1 | import os, torch, argparse, asyncio, websockets, threading
2 | import sys
3 | import gradio as gr
4 | from src import quantize
5 | from langchain_community.llms import LlamaCpp
6 | from langchain.callbacks.manager import CallbackManager
7 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
8 | from langchain_core.prompts import PromptTemplate
9 | from core import list_converted_gguf_models, default_repo_id, read_config, update_config, removeModelFromCache
10 |
11 | # allow imports from the cloned llama.cpp sources and the local src package
12 | sys.path.append('./src/llama_cpp/')
13 | sys.path.append('./src/')
14 |
15 | cache_gguf_dir = os.path.join(os.getcwd(), "src/quantized_model")
16 |
17 | # Callbacks support token-wise streaming
18 | callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
19 |
20 | #check if cuda is available
21 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
22 |
23 | state, config = read_config()
24 | if state is None:
25 |     config.set('Settings', 'execution_provider', device)
26 |     config.set('Settings', 'repo_id', default_repo_id)
27 |     update_config(config)
28 | else:
29 |     default_repo_id = config.get('Settings', 'repo_id')
30 |     device = config.get('Settings', 'execution_provider')
31 |
32 | def snapshot_download_and_convert_to_gguf(repo_id):
33 |     gguf_model_path = quantize.quantize_model(repo_id)
34 |     return gguf_model_path
35 |
36 | n_gpu_layers = None
37 | if device == "cuda":
38 |     n_gpu_layers = -1
39 | else:
40 |     n_gpu_layers = 0
41 | n_ctx = 6000
42 | n_batch = 30
43 | n_parts = 1
44 | temperature = 0.9
45 | max_tokens = 4095
46 |
47 | def init_llm_chain(model_path):
48 |     llm = LlamaCpp(
49 |         model_path=model_path,
50 |         n_gpu_layers=n_gpu_layers,
51 |         n_ctx=n_ctx,
52 |         n_batch=n_batch,
53 |         temperature=temperature,
54 |         max_tokens=max_tokens,
55 |         n_parts=n_parts,
56 |         callback_manager=callback_manager,
57 |         verbose=True
58 |     )
59 |
60 |     template = """Question: {question}
61 | Answer: Let's work this out in a step by step way to be sure we have the right answer."""
62 |
63 |     prompt = PromptTemplate.from_template(template)
64 |     llm_chain = prompt | llm
65 |     return llm_chain, llm
66 |
67 | def parse_args():
68 |     parser = argparse.ArgumentParser(description='Optional arguments for --host, --port & --share.')
69 |     parser.add_argument('--host', type=str, default='0.0.0.0', help='The host IP to run the server on.')
70 |     parser.add_argument('--port', type=int, default=7860, help='The port to run the server on.')
71 |     parser.add_argument('--share', action='store_true', help='Create a public Gradio share link.')
72 |     return parser.parse_args()
73 |
74 | args = parse_args()
75 |
76 | model_path = snapshot_download_and_convert_to_gguf(default_repo_id)
77 |
78 | async def generate(websocket):
79 |     async for message in websocket:
80 |         output = llm_chain.stream(message)
81 |         for character in output:
82 |             await asyncio.sleep(0)
83 |             await websocket.send(character)
84 |
85 | async def start_websockets():
86 |     print("Starting WebSocket server on port 7861 ...")
87 |     async with websockets.serve(generate, "localhost", 7861):
88 |         await asyncio.Future()
89 |
90 | async def main():
91 |     await start_websockets()
92 |
93 | with gr.Blocks(css='style.css') as demo:
94 |     with gr.Tabs(selected="chat") as tabs:
95 |         with gr.Tab("Chat", id="chat"):
96 |             with gr.Row():
97 |                 with gr.Column(scale=1):
98 |                     title = gr.Button(
99 |                         value="LLMinator",
100 |                         scale=1,
101 |                         variant="primary",
102 |                         interactive=True,
103 |                         elem_id="title-container"
104 |                     )
105 |                     converted_models_chat = gr.Dropdown(
106 |                         choices=list_converted_gguf_models(cache_gguf_dir),
107 |                         value=default_repo_id,
108 |                         max_choices=5,
109 |                         filterable=True,
110 |                         info="Default: stabilityai/stable-code-instruct-3b",
111 |                         label="Selected Model",
112 |                     )
113 |                     with gr.Group():
114 |                         execution_provider = gr.Radio(
115 |                             ["cuda", "cpu"],
116 |                             value=device,
117 |                             label="Execution providers",
118 |                             info="Select Device"
119 |                         )
120 |
121 |                 with gr.Column(scale=4):
122 |                     with gr.Group():
123 |                         chatbot = gr.Chatbot(elem_id="chatbot-container")
124 |                         msg = gr.Textbox(label="Prompt")
125 |                         stop = gr.Button("Stop")
126 |
127 |         with gr.Tab("Models", id="models"):
128 |             with gr.Row():
129 |                 with gr.Column():
130 |                     with gr.Group():
131 |                         model_repo_id = gr.Textbox(
132 |                             value="",
133 |                             label="Hugging Face Repo",
134 |                             info="Default: stabilityai/stable-code-instruct-3b",
135 |                             interactive=True
136 |                         )
137 |                         format_choice = gr.Dropdown(
138 |                             choices=["gguf"],
139 |                             value="gguf",
140 |                             label="Convert Format",
141 |                             interactive=True
142 |                         )
143 |                         download_convert_btn = gr.Button(
144 |                             value="Download Snapshot & Convert",
145 |                             variant="secondary",
146 |                             interactive=True
147 |                         )
148 |             with gr.Row():
149 |                 with gr.Group():
150 |                     converted_models = gr.Dropdown(
151 |                         choices=list_converted_gguf_models(cache_gguf_dir),
152 |                         value=default_repo_id,
153 |                         max_choices=5,
154 |                         filterable=True,
155 |                         info="GGUF models available on disk",
156 |                         label="Converted Models",
157 |                         interactive=True
158 |                     )
159 |                     send_to_chat_btn = gr.Button(
160 |                         value="Send to Chat",
161 |                         variant="secondary",
162 |                         interactive=True
163 |                     )
164 |
165 |                 with gr.Group():
166 |                     saved_gguf_models = gr.Dropdown(
167 |                         choices=list_converted_gguf_models(cache_gguf_dir),
168 |                         max_choices=5,
169 |                         filterable=True,
170 |                         info="GGUF models available on disk",
171 |                         label="Remove Models",
172 |                         interactive=True
173 |                     )
174 |                     remove_model_btn = gr.Button(
175 |                         value="Remove Model",
176 |                         variant="danger",
177 |                         interactive=True
178 |                     )
179 |         with gr.Tab("Configs", id="configs"):
180 |             with gr.Row():
181 |                 with gr.Column(elem_id="configs-container"):
182 |                     n_gpu_layers_input = gr.Slider(0, 5000, value=5000, step=1, label="n_gpu_layers", visible=torch.cuda.is_available(), interactive=True)
183 |                     n_ctx_input = gr.Slider(100, 6000, value=6000, label="n_ctx", interactive=True)
184 |                     n_batch_input = gr.Slider(1, 512, value=30, label="n_batch", visible=torch.cuda.is_available(), interactive=True)
185 |                     n_parts_input = gr.Slider(1, 10, step=1, value=1, label="n_parts", interactive=True)
186 |                     temperature_input = gr.Slider(0.1, 1, step=0.1, value=0.9, label="temperature", interactive=True)
187 |                     max_tokens_input = gr.Slider(1, 4095, value=4095, label="max_tokens", interactive=True)
188 |
189 |             with gr.Row():
190 |                 config_update_btn = gr.Button(
191 |                     value="Update Configs",
192 |                     variant="primary",
193 |                     interactive=True
194 |                 )
195 |
196 |                 config_reset_btn = gr.Button(
197 |                     value="Reset Configs",
198 |                     variant="primary",
199 |                     interactive=True
200 |                 )
201 |
202 |     llm_chain, llm = init_llm_chain(model_path)
203 |
204 |     def updateConfigs(n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input):
205 |         global n_gpu_layers, n_ctx, n_batch, n_parts, temperature, max_tokens, llm_chain, llm
206 |         if torch.cuda.is_available():
207 |             n_gpu_layers = n_gpu_layers_input
208 |             if n_batch_input < n_ctx_input:
209 |                 n_batch = n_batch_input
210 |             else:
211 |                 raise gr.Error("n_batch should be between 1 and n_ctx")
212 |         else:
213 |             n_gpu_layers = 0
214 |             n_ctx = 30
215 |
216 |         n_ctx = n_ctx_input
217 |         n_parts = n_parts_input
218 |         temperature = temperature_input
219 |         max_tokens = max_tokens_input
220 |         llm_chain, llm = init_llm_chain(model_path)
221 |         return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.Tabs(selected="chat")
222 |
223 |     def resetConfigs():
224 |         global n_gpu_layers, n_ctx, n_batch, n_parts, temperature, max_tokens, llm_chain, llm
225 |         n_gpu_layers = 0
226 |         n_ctx = 6000
227 |         n_batch = 30
228 |         n_parts = 1
229 |         temperature = 0.9
230 |         max_tokens = 4095
231 |         llm_chain, llm = init_llm_chain(model_path)
232 |         return gr.update(value="0"), gr.update(value="6000"), gr.update(value="30"), gr.update(value="1"), gr.update(value="0.9"), gr.update(value="4095")
233 |
234 |     def updateExecutionProvider(provider, gguf_model):
235 |         global device
236 |         if provider == "cuda":
237 |             if torch.cuda.is_available():
238 |                 device = "cuda"
239 |             else:
240 |                 raise gr.Error("Torch not compiled with CUDA enabled. Please make sure CUDA is installed.")
241 |
242 |         else:
243 |             device = "cpu"
244 |
245 |         update_config(config, execution_provider=provider)
246 |         loadModel(gguf_model)
247 |         return gr.update(value=device)
248 |
249 |     def removeModel(model_name):
250 |         removeModelFromCache(model_name)
251 |         return gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir))
252 |
253 |     def user(user_message, history):
254 |         return "", history + [[user_message, None]]
255 |
256 |     def downloadConvertModel(model_repo_id):
257 |         if model_repo_id:
258 |             snapshot_download_and_convert_to_gguf(model_repo_id)
259 |             return gr.update(value=""), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir))
260 |         else:
261 |             raise gr.Error("Repo cannot be empty!")
262 |
263 |     def loadModel(repo_id):
264 |         global llm_chain, llm
265 |         model_path = snapshot_download_and_convert_to_gguf(repo_id)
266 |         llm_chain, llm = init_llm_chain(model_path)
267 |         update_config(config, repo_id=repo_id)
268 |
269 |     def loadModelFromModelsTab(model_repo_id):
270 |         loadModel(model_repo_id)
271 |         return gr.update(value=model_repo_id), gr.Tabs(selected="chat")
272 |
273 |     def loadModelFromChatTab(repo_id):
274 |         loadModel(repo_id)
275 |         return gr.update(value=repo_id)
276 |
277 |     def bot(history):
278 |         print("Question: ", history[-1][0])
279 |         output = llm_chain.stream(history[-1][0])
280 |         print("stream:", output)
281 |         history[-1][1] = ""
282 |         for character in output:
283 |             # print(character)
284 |             history[-1][1] += character
285 |             yield history
286 |
287 |     submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
288 |     # stop.click(None, None, None, cancels=[submit_event], queue=False)
289 |     download_convert_btn.click(downloadConvertModel, model_repo_id, [model_repo_id, converted_models_chat, converted_models, saved_gguf_models], queue=False, show_progress="full")
290 |     send_to_chat_btn.click(loadModelFromModelsTab, converted_models, [converted_models_chat, tabs], queue=False, show_progress="full")
291 |     converted_models_chat.change(loadModelFromChatTab, converted_models_chat, converted_models_chat, queue=False, show_progress="full")
292 |     remove_model_btn.click(removeModel, saved_gguf_models, [saved_gguf_models, converted_models_chat, converted_models], queue=False, show_progress="full")
293 |     execution_provider.change(updateExecutionProvider, [execution_provider, converted_models_chat], execution_provider, queue=False, show_progress="full")
294 |     config_update_btn.click(updateConfigs, [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input], [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input, tabs], queue=False, show_progress="full")
295 |     config_reset_btn.click(resetConfigs, None, [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input], queue=False, show_progress="full")
296 |
297 | demo.queue()
298 |
299 | def launch_demo():
300 |     demo.launch(server_name=args.host, server_port=args.port, share=args.share)
301 |
302 | if __name__ == "__main__":
303 |     threading.Thread(target=launch_demo).start()
304 |     asyncio.run(main())
--------------------------------------------------------------------------------