├── .gitignore ├── CMakeLists.txt ├── Dockerfile ├── LICENSE.md ├── Makefile ├── README.md ├── __init__.py ├── configs └── config.ini ├── core.py ├── example └── index.html ├── main.py ├── requirements.txt ├── src └── quantize.py ├── style.css └── webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | #ignore downloaded models 2 | models/models--* 3 | models/.locks 4 | models/tmp* 5 | configs/config.ini 6 | 7 | src/original_model/* 8 | src/quantized_model/* 9 | 10 | src/llama_cpp 11 | 12 | #ignore build folder(for cmake) 13 | build 14 | 15 | #compiled files 16 | *.pyc 17 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Minimum required CMake version 2 | cmake_minimum_required(VERSION 3.15) 3 | 4 | # Project name 5 | project(llama_cpp) 6 | 7 | # Git repository location 8 | set(REPO_URL "https://github.com/ggerganov/llama.cpp") 9 | 10 | # Requirements file 11 | set(REQUIREMENTS_FILE "requirements.txt") 12 | 13 | # Llama directory 14 | set(LLAMA_DIR "${PROJECT_SOURCE_DIR}/src/llama_cpp") 15 | 16 | # Check for Python and Git using CMake's FIND_PACKAGE 17 | find_package(PythonLibs REQUIRED) 18 | find_package(Git REQUIRED) 19 | 20 | # Download and clone the llama.cpp repository 21 | execute_process( 22 | COMMAND git clone ${REPO_URL} ${LLAMA_DIR} 23 | RESULT_VARIABLE git_result 24 | ) 25 | 26 | # Error handling for Git clone 27 | if(NOT ${git_result} EQUAL 0) 28 | message(FATAL_ERROR "Failed to clone llama.cpp repository") 29 | endif() 30 | 31 | # Install Python requirements 32 | execute_process( 33 | COMMAND pip install -r "${LLAMA_DIR}/${REQUIREMENTS_FILE}" 34 | ) 35 | 36 | file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/src/quantized_model") 37 | 38 | find_program(PYTHON NAMES python python3 2>/dev/null) 39 | 40 | if(PYTHON) 41 | file(APPEND "${PROJECT_SOURCE_DIR}/configs/config.ini" "py_cmd = ${PYTHON}") 42 | endif() -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Step 1: Use an official base image 2 | FROM python:3.11.9-slim 3 | 4 | # Step 2: Install C++ compiler and other dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | g++ \ 7 | make \ 8 | git \ 9 | && apt-get clean 10 | 11 | # Step 3: Set the working directory 12 | WORKDIR /app 13 | 14 | # Step 4: Copy the source code into the container 15 | COPY . 
/app 16 | 17 | # Step 5: Install Python dependencies 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | # Step 6: Run makefile to build LLMinator 21 | RUN make 22 | 23 | # Step 7: Launch LLMinator 24 | CMD ["python3","webui.py"] -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aesthisia Datacenters 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile to clone llama.cpp repository and install requirements 2 | 3 | # Variables 4 | REPO_URL := https://github.com/ggerganov/llama.cpp 5 | REQUIREMENTS_FILE := requirements.txt 6 | LLAMA_DIR := src/llama_cpp 7 | 8 | # Determine pip command 9 | PIP := $(shell command -v pip3 2>/dev/null || command -v pip) 10 | 11 | # Check if python and git are installed 12 | PYTHON := $(shell command -v python 2>/dev/null || command -v python3 2>/dev/null) 13 | GIT := $(shell command -v git) 14 | 15 | ifeq ($(PYTHON),) 16 | $(error Python is not installed. Please install Python before running this Makefile.) 17 | endif 18 | 19 | ifeq ($(GIT),) 20 | $(error Git is not installed. Please install Git before running this Makefile.) 21 | endif 22 | 23 | # Targets 24 | .PHONY: all clone install clean quantized_model_dir append_to_configs 25 | 26 | all: clone install quantized_model_dir append_to_configs 27 | 28 | clone: 29 | mkdir -p $(LLAMA_DIR) 30 | git clone $(REPO_URL) $(LLAMA_DIR) 31 | 32 | install: 33 | cd $(LLAMA_DIR) && \ 34 | $(PIP) install -r $(REQUIREMENTS_FILE) 35 | 36 | quantized_model_dir: 37 | mkdir -p src/quantized_model 38 | 39 | append_to_configs: 40 | echo "py_cmd = $(PYTHON)" >> configs/config.ini 41 | 42 | clean: 43 | rm -rf $(LLAMA_DIR) 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## LLMinator: Run & Test LLMs directly from HuggingFace 2 | 3 | #### Gradio based tool with integrated chatbot to locally run & test LLMs directly from HuggingFace. 4 | 5 | An easy-to-use tool made with Gradio, LangChain, and Torch. 
6 | 7 | ![LLMinator chat tab](https://github.com/Aesthisia/LLMinator/assets/89995648/0c7fd00f-610b-4ad1-8736-1f0cb7d212de) 8 | 9 | ![LLMinator models tab](https://github.com/Aesthisia/LLMinator/assets/89995648/44c03281-fb76-40c6-b1d3-2e395562ae16) 10 | 11 | ### ⚡ Features 12 | 13 | - Context-aware streaming chatbot. 14 | - Inbuilt code syntax highlighting. 15 | - Load any LLM repo directly from HuggingFace. 16 | - Supports both CPU & CUDA modes. 17 | - Enable LLM inference with [llama.cpp](https://github.com/ggerganov/llama.cpp) using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python). 18 | - Convert models (Safetensors, PyTorch `.pt`, etc.) to GGUF. 19 | - Customize LLM inference parameters (n_gpu_layers, temperature, max_tokens, etc.). 20 | - Real-time text generation via WebSockets, enabling seamless integration with different frontend frameworks. 21 | 22 | ## 🚀 Installation 23 | 24 | To use LLMinator, follow these simple steps: 25 | 26 | #### Clone the LLMinator repository from GitHub & install requirements 27 | 28 | ``` 29 | git clone https://github.com/Aesthisia/LLMinator.git 30 | cd LLMinator 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | #### Build LLMinator with [llama.cpp](https://github.com/ggerganov/llama.cpp): 35 | 36 | - Using `make`: 37 | 38 | - On Linux or macOS: 39 | 40 | ```bash 41 | make 42 | ``` 43 | 44 | - On Windows: 45 | 46 | 1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 47 | 2. Extract `w64devkit` on your PC. 48 | 3. Run `w64devkit.exe`. 49 | 4. Use the `cd` command to reach the `LLMinator` folder. 50 | 5. From there, run: 51 | ```bash 52 | make 53 | ``` 54 | 55 | - Using `CMake`: 56 | ```bash 57 | mkdir build 58 | cd build 59 | cmake .. 60 | ``` 61 | 62 | #### Launch LLMinator in the browser 63 | 64 | - Run the LLMinator tool using the command `python webui.py`. 65 | - Access the web interface by opening [http://127.0.0.1:7860](http://127.0.0.1:7860) in your browser. 66 | - Start interacting with the chatbot and experimenting with LLMs! 67 | 68 | 69 | 70 | Check out this YouTube [video](https://www.youtube.com/watch?v=OL8wRYbdjLE) to follow the installation steps. 71 | 72 | ### Command line arguments 73 | 74 | | Argument Command | Default | Description | 75 | | ---------------- | ------- | ---------------------------------------------------------------------------- | 76 | | --host | 0.0.0.0 | Host or IP address on which the server will listen for incoming connections | 77 | | --port | 7860 | Launch Gradio with the given server port | 78 | | --share | False | Generates a public shareable link that you can send to anybody | 79 | 80 | ### Connect to WebSocket for generation 81 | 82 | Connect to [ws://localhost:7861/](ws://localhost:7861/) for real-time text generation. Submit prompts and receive responses through the WebSocket connection. 83 | 84 | **Integration with Frontends:** 85 | 86 | The provided `example/index.html` demonstrates basic usage of text generation through a WebSocket connection. You can integrate it with any frontend framework, such as React. 87 | 88 | ## Installation and Development Tips 89 | 90 | #### Python Version 91 | 92 | - **Compatible Versions:** This project is compatible with Python versions 3.8 to 3.11. Ensure you have one of these versions installed on your system. You can check your Python version by running `python --version` or `python3 --version` in your terminal.
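A quick way to confirm the interpreter from inside Python (handy when several versions are installed side by side); a minimal sketch in which the 3.8-3.11 range comes from the compatibility note above and nothing project-specific is assumed:

```python
import sys

# LLMinator's README targets Python 3.8 through 3.11.
major_minor = sys.version_info[:2]
if (3, 8) <= major_minor <= (3, 11):
    print(f"Python {sys.version.split()[0]} is in the supported range.")
else:
    print(f"Python {sys.version.split()[0]} is outside the supported 3.8-3.11 range.")
```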
93 | 94 | #### CMake and C Compiler 95 | 96 | - **CMake Dependency:** If you plan to build the project using CMake, make sure you have CMake installed. 97 | - **C Compiler:** You will also need a C compiler such as GCC, which is typically included with most Linux distributions. You can check by running `gcc --version` in your terminal; installation instructions for your specific operating system can be found online. 98 | 99 | #### Visual Studio Code 100 | 101 | - **Visual Studio Installer:** If you're using Visual Studio Code for development on Windows, you'll need the C++ development workload installed. You can add it through the [Visual Studio Installer](https://visualstudio.microsoft.com/vs/features/cplusplus/). 102 | 103 | #### GPU Acceleration (CUDA) 104 | 105 | - **CUDA Installation:** To leverage GPU acceleration, you'll need CUDA installed on your system. Download instructions are available on the [NVIDIA website](https://developer.nvidia.com/cuda-toolkit). 106 | - **Torch Compatibility:** After installing CUDA, confirm CUDA availability with `torch.cuda.is_available()`; a short verification snippet appears near the end of this README. When using a GPU, make sure you follow the project's `llama-cpp-python` installation configuration for CUDA support. 107 | 108 | ## Reporting Issues 109 | 110 | If you encounter any errors or issues, feel free to file a detailed report in the project's repository. We're always happy to help! When reporting an issue, please provide as much information as possible, including the error message, logs, the steps you took, and your system configuration. This makes it easier for us to diagnose and fix the problem quickly. 111 | 112 | ## 🤝 Contributions 113 | 114 | We welcome contributions from the community to enhance LLMinator further. If you'd like to contribute, please follow these guidelines: 115 | 116 | - Fork the LLMinator repository on GitHub. 117 | - Create a new branch for your feature or bug fix. 118 | - Test your changes thoroughly. 119 | - Submit a pull request, providing a clear description of the changes you've made.
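As a companion to the GPU Acceleration tips above, the sketch below verifies the CUDA/Torch side of the setup before launching LLMinator. It relies only on the `torch` package pinned in `requirements.txt`; the printed messages are illustrative, not part of the project:

```python
import torch

# Quick sanity check for the CUDA setup described in the GPU Acceleration tips.
if torch.cuda.is_available():
    print("CUDA is available:", torch.cuda.get_device_name(0))
    print("Torch was built against CUDA:", torch.version.cuda)
else:
    print("CUDA not available; LLMinator will fall back to CPU mode (n_gpu_layers = 0).")
```

If this reports the CPU branch on a machine with an NVIDIA GPU, the installed Torch build is usually CPU-only; reinstall a CUDA-enabled build before troubleshooting `llama-cpp-python`.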
120 | 121 | Reach out to us: info@aesthisia.com 122 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /configs/config.ini: -------------------------------------------------------------------------------- 1 | [Settings] 2 | execution_provider = 3 | repo_id = 4 | -------------------------------------------------------------------------------- /core.py: -------------------------------------------------------------------------------- 1 | import os, shutil 2 | from configparser import ConfigParser 3 | import gradio as gr 4 | 5 | default_repo_id = "stabilityai/stable-code-instruct-3b" 6 | config_path = "configs/config.ini" 7 | cache_gguf_dir = os.path.join(os.getcwd(), "src/quantized_model") 8 | cache_original_dir = os.path.join(os.getcwd(), "src/original_model") 9 | 10 | def format_gguf_model_name(file_name): 11 | parts = file_name.replace('.gguf', '').split("__") 12 | return "/".join(parts) 13 | 14 | def list_converted_gguf_models(cache_gguf_dir): 15 | contents = os.listdir(cache_gguf_dir) 16 | model_files = [format_gguf_model_name(item) for item in contents] 17 | return model_files 18 | 19 | def removeModelFromCache(model_name): 20 | config = ConfigParser() 21 | config.read(config_path) 22 | repo_id = config.get('Settings', 'repo_id') 23 | if model_name == repo_id: 24 | raise gr.Error("Can not delete default model") 25 | else: 26 | gguf_model_name = model_name.replace("/", "__") + ".gguf" 27 | original_model_parts = model_name.split("/") 28 | original_model_name = f"model--{'--'.join(original_model_parts)}" 29 | try: 30 | os.remove(os.path.join(cache_gguf_dir, gguf_model_name)) 31 | shutil.rmtree(os.path.join(cache_original_dir, original_model_name)) 32 | except FileNotFoundError: 33 | raise gr.Error("Model not found in cache.") 34 | 35 | def read_config(): 36 | config = ConfigParser() 37 | config.read(config_path) 38 | if config.get('Settings', 'repo_id') == "" and config.get('Settings', 'execution_provider') == "": 39 | return None, config 40 | else: 41 | return config, config 42 | 43 | def update_config(config, **kwargs): 44 | for key, value in kwargs.items(): 45 | config.set('Settings', key, value) 46 | with open(config_path, 'w') as configfile: 47 | config.write(configfile) -------------------------------------------------------------------------------- /example/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | LLMinator 7 | 8 | 9 |

(The HTML markup of this example page was lost when the repository was flattened into text; only the page title "LLMinator", a heading, and bare line numbers remain, and lines 16-32 left no text at all. Per the README, the page is a minimal browser client that submits a prompt over the WebSocket connection and renders the streamed response.)
14 | 15 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fastapi import FastAPI, WebSocket 3 | from src import quantize 4 | from langchain import PromptTemplate 5 | from langchain_community.llms import LlamaCpp 6 | from langchain.callbacks.manager import CallbackManager 7 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 8 | from langchain_core.prompts import PromptTemplate 9 | from core import default_repo_id 10 | 11 | app = FastAPI() 12 | 13 | # Callbacks support token-wise streaming 14 | callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) 15 | 16 | #check if cuda is available 17 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 18 | 19 | n_gpu_layers = None 20 | if device == "cuda": 21 | n_gpu_layers = -1 22 | else: 23 | n_gpu_layers = 0 24 | n_ctx = 6000 25 | n_batch = 30 26 | n_parts = 1 27 | temperature = 0.9 28 | max_tokens = 500 29 | 30 | def snapshot_download_and_convert_to_gguf(repo_id): 31 | gguf_model_path = quantize.quantize_model(repo_id) 32 | return gguf_model_path 33 | 34 | def init_llm_chain(model_path): 35 | llm = LlamaCpp( 36 | model_path=model_path, 37 | n_gpu_layers=n_gpu_layers, 38 | n_ctx=n_ctx, 39 | n_batch=n_batch, 40 | temperature=temperature, 41 | max_tokens=max_tokens, 42 | n_parts=n_parts, 43 | callback_manager=callback_manager, 44 | verbose=True 45 | ) 46 | 47 | template = """Question: {question} 48 | Answer: Let's work this out in a step by step way to be sure we have the right answer.""" 49 | 50 | prompt = PromptTemplate.from_template(template) 51 | llm_chain = prompt | llm 52 | return llm_chain, llm 53 | 54 | model_path = snapshot_download_and_convert_to_gguf(default_repo_id) 55 | 56 | llm_chain, llm = init_llm_chain(model_path) 57 | 58 | @app.websocket("/generate") 59 | async def websocket_endpoint(websocket: WebSocket): 60 | await websocket.accept() 61 | while True: 62 | prompt = await websocket.receive_text() 63 | 64 | async def bot(prompt): 65 | print("Question: ", prompt) 66 | output = llm_chain.stream(prompt) 67 | print("stream:", output) 68 | for character in output: 69 | print(character) 70 | await websocket.send_text(character) 71 | 72 | await bot(prompt) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.27.0 2 | huggingface_hub==0.21.1 3 | langchain==0.1.14 4 | torch==2.1.2 5 | llama-cpp-python==0.2.76 6 | fastapi==0.110.1 -------------------------------------------------------------------------------- /src/quantize.py: -------------------------------------------------------------------------------- 1 | import subprocess, os 2 | from huggingface_hub import snapshot_download 3 | from configparser import ConfigParser 4 | 5 | config_path = "./configs/config.ini" 6 | 7 | def get_py_cmd(): 8 | config = ConfigParser() 9 | config.read(config_path) 10 | py_cmd = config.get('Settings', 'py_cmd') 11 | if "python3" in py_cmd: 12 | return 'python3' 13 | else: 14 | return 'python' 15 | 16 | def quantize_model(repo_id): 17 | original_models_path = "./src/original_model/" 18 | quantized_path = "./src/quantized_model/" 19 | 20 | repo_id_parts = repo_id.split("/") 21 | model_folder = f"model--{'--'.join(repo_id_parts)}" 22 | model_path = original_models_path + model_folder 23 | 24 | outfile = 
quantized_path + repo_id.replace("/", "__") + ".gguf" 25 | 26 | if os.path.isfile(outfile): 27 | return outfile 28 | 29 | snapshot_download(repo_id=repo_id, local_dir=model_path , local_dir_use_symlinks=True) 30 | 31 | command = [ 32 | get_py_cmd(), 33 | './src/llama_cpp/convert-hf-to-gguf.py', 34 | model_path, 35 | '--outtype', 'f16', 36 | '--outfile', outfile 37 | ] 38 | 39 | subprocess.run(command, check=True) 40 | 41 | return outfile 42 | -------------------------------------------------------------------------------- /style.css: -------------------------------------------------------------------------------- 1 | #chatbot-container { 2 | min-height: calc(100vh - 200px); 3 | } 4 | 5 | #title-container { 6 | max-height: 50px; 7 | } 8 | 9 | #configs-container { 10 | max-width: 700px; 11 | } -------------------------------------------------------------------------------- /webui.py: -------------------------------------------------------------------------------- 1 | import os, torch, argparse, asyncio, websockets, threading 2 | import gradio as gr 3 | from src import quantize 4 | from langchain import PromptTemplate 5 | from langchain_community.llms import LlamaCpp 6 | from langchain.callbacks.manager import CallbackManager 7 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 8 | from langchain_core.prompts import PromptTemplate 9 | from core import list_converted_gguf_models, default_repo_id, read_config, update_config, removeModelFromCache 10 | import sys 11 | 12 | sys.path.append('./src/llama_cpp/') 13 | sys.path.append('./src/') 14 | 15 | cache_gguf_dir = os.path.join(os.getcwd(), "src/quantized_model") 16 | 17 | # Callbacks support token-wise streaming 18 | callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) 19 | 20 | #check if cuda is available 21 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 22 | 23 | state, config = read_config() 24 | if state == None: 25 | config.set('Settings', 'execution_provider', device) 26 | config.set('Settings', 'repo_id', default_repo_id) 27 | update_config(config) 28 | else: 29 | default_repo_id = config.get('Settings', 'repo_id') 30 | device = config.get('Settings', 'execution_provider') 31 | 32 | def snapshot_download_and_convert_to_gguf(repo_id): 33 | gguf_model_path = quantize.quantize_model(repo_id) 34 | return gguf_model_path 35 | 36 | n_gpu_layers = None 37 | if device == "cuda": 38 | n_gpu_layers = -1 39 | else: 40 | n_gpu_layers = 0 41 | n_ctx = 6000 42 | n_batch = 30 43 | n_parts = 1 44 | temperature = 0.9 45 | max_tokens = 4095 46 | 47 | def init_llm_chain(model_path): 48 | llm = LlamaCpp( 49 | model_path=model_path, 50 | n_gpu_layers=n_gpu_layers, 51 | n_ctx=n_ctx, 52 | n_batch=n_batch, 53 | temperature=temperature, 54 | max_tokens=max_tokens, 55 | n_parts=n_parts, 56 | callback_manager=callback_manager, 57 | verbose=True 58 | ) 59 | 60 | template = """Question: {question} 61 | Answer: Let's work this out in a step by step way to be sure we have the right answer.""" 62 | 63 | prompt = PromptTemplate.from_template(template) 64 | llm_chain = prompt | llm 65 | return llm_chain, llm 66 | 67 | def parse_args(): 68 | parser = argparse.ArgumentParser(description='Optional arguments for --host & --port.') 69 | parser.add_argument('--host', type=str, default='0.0.0.0', help='The host IP to run the server on.') 70 | parser.add_argument('--port', type=int, default=7860, help='The port to run the server on.') 71 | parser.add_argument('--share', type=bool, default=False, help='To create a public 
link.') 72 | return parser.parse_args() 73 | 74 | args = parse_args() 75 | 76 | model_path = snapshot_download_and_convert_to_gguf(default_repo_id) 77 | 78 | async def generate(websocket): 79 | async for message in websocket: 80 | output = llm_chain.stream(message) 81 | for character in output: 82 | await asyncio.sleep(0) 83 | await websocket.send(character) 84 | 85 | async def start_websockets(): 86 | print(f"Starting WebSocket server on port 7861 ...") 87 | async with websockets.serve(generate, "localhost", 7861): 88 | await asyncio.Future() 89 | 90 | async def main(): 91 | await start_websockets() 92 | 93 | with gr.Blocks(css='style.css') as demo: 94 | with gr.Tabs(selected="chat") as tabs: 95 | with gr.Tab("Chat", id="chat"): 96 | with gr.Row(): 97 | with gr.Column(scale=1): 98 | title = gr.Button( 99 | value="LLMinator", 100 | scale=1, 101 | variant="primary", 102 | interactive=True, 103 | elem_id="title-container" 104 | ) 105 | converted_models_chat = gr.Dropdown( 106 | choices=list_converted_gguf_models(cache_gguf_dir), 107 | value=default_repo_id, 108 | max_choices=5, 109 | filterable=True, 110 | info="Default: stabilityai/stable-code-instruct-3b", 111 | label="Selected Model", 112 | ) 113 | with gr.Group(): 114 | execution_provider = gr.Radio( 115 | ["cuda", "cpu"], 116 | value=device, 117 | label="Execution providers", 118 | info="Select Device" 119 | ) 120 | 121 | with gr.Column(scale=4): 122 | with gr.Group(): 123 | chatbot = gr.Chatbot(elem_id="chatbot-container") 124 | msg = gr.Textbox(label="Prompt") 125 | stop = gr.Button("Stop") 126 | 127 | with gr.Tab("Models", id="models"): 128 | with gr.Row(): 129 | with gr.Column(): 130 | with gr.Group(): 131 | model_repo_id = gr.Textbox( 132 | value="", 133 | label="Hugging Face Repo", 134 | info="Default: stabilityai/stable-code-instruct-3b", 135 | interactive=True 136 | ) 137 | format_choice = gr.Dropdown( 138 | choices=["gguf"], 139 | value="gguf", 140 | label="Convert Format", 141 | interactive=True 142 | ) 143 | download_convert_btn = gr.Button( 144 | value="Download Snapshot & Convert", 145 | variant="secondary", 146 | interactive=True 147 | ) 148 | with gr.Row(): 149 | with gr.Group(): 150 | converted_models = gr.Dropdown( 151 | choices=list_converted_gguf_models(cache_gguf_dir), 152 | value=default_repo_id, 153 | max_choices=5, 154 | filterable=True, 155 | info="gguf models available in the disk", 156 | label="Converted Models", 157 | interactive=True 158 | ) 159 | send_to_chat_btn = gr.Button( 160 | value="Send to Chat", 161 | variant="secondary", 162 | interactive=True 163 | ) 164 | 165 | with gr.Group(): 166 | saved_gguf_models = gr.Dropdown( 167 | choices=list_converted_gguf_models(cache_gguf_dir), 168 | max_choices=5, 169 | filterable=True, 170 | info="gguf models available in the disk", 171 | label="Remove Models", 172 | interactive=True 173 | ) 174 | remove_model_btn = gr.Button( 175 | value="Remove Model", 176 | variant="danger", 177 | interactive=True 178 | ) 179 | with gr.Tab("Configs", id="configs"): 180 | with gr.Row(): 181 | with gr.Column(elem_id="configs-container"): 182 | n_gpu_layers_input = gr.Slider(0, 5000, value=5000, step=1, label="n_gpu_layers", visible=torch.cuda.is_available(), interactive= True) 183 | n_ctx_input = gr.Slider(100, 6000, value=6000, label="n_ctx", interactive= True) 184 | n_batch_input = gr.Slider(1, 512, value=30, label="n_batch", visible=torch.cuda.is_available(), interactive= True) 185 | n_parts_input = gr.Slider(1, 10, step=1, value=1, label="n_parts", interactive= True) 186 | 
temperature_input = gr.Slider(0.1, 1, step=0.1, value=0.9, label="temperature", interactive= True) 187 | max_tokens_input = gr.Slider(1, 4095, value=4095, label="max_tokens", interactive= True) 188 | 189 | with gr.Row(): 190 | config_update_btn = gr.Button( 191 | value="Update Configs", 192 | variant="primary", 193 | interactive=True 194 | ) 195 | 196 | config_reset_btn = gr.Button( 197 | value="Reset Configs", 198 | variant="primary", 199 | interactive=True 200 | ) 201 | 202 | llm_chain, llm = init_llm_chain(model_path) 203 | 204 | def updateConfigs(n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input): 205 | global n_gpu_layers, n_ctx, n_batch, n_parts, temperature, max_tokens, llm_chain, llm 206 | if torch.cuda.is_available(): 207 | n_gpu_layers = n_gpu_layers_input 208 | if n_batch_input < n_ctx_input: 209 | n_batch = n_batch_input 210 | else: 211 | raise gr.Error("n_batch should be between 1 and n_ctx") 212 | else: 213 | n_gpu_layers = 0 214 | n_ctx = 30 215 | 216 | n_ctx = n_ctx_input 217 | n_parts = n_parts_input 218 | temperature = temperature_input 219 | max_tokens = max_tokens_input 220 | llm_chain, llm = init_llm_chain(model_path) 221 | return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.Tabs(selected="chat") 222 | 223 | def resetConfigs(): 224 | global n_gpu_layers, n_ctx, n_batch, n_parts, temperature, max_tokens, llm_chain, llm 225 | n_gpu_layers = 0 226 | n_ctx = 6000 227 | n_batch = 30 228 | n_parts = 1 229 | temperature = 0.9 230 | max_tokens = 4095 231 | llm_chain, llm = init_llm_chain(model_path) 232 | return gr.update(value="0"), gr.update(value="6000"), gr.update(value="30"), gr.update(value="1"), gr.update(value="0.9"), gr.update(value="4095") 233 | 234 | def updateExecutionProvider(provider, gguf_model): 235 | global device 236 | if provider == "cuda": 237 | if torch.cuda.is_available(): 238 | device = "cuda" 239 | else: 240 | raise gr.Error("Torch not compiled with CUDA enabled. 
Please make sure cuda is installed.") 241 | 242 | else: 243 | device = "cpu" 244 | 245 | update_config(config, execution_provider=provider) 246 | loadModel(gguf_model) 247 | return gr.update(value=device) 248 | 249 | def removeModel(model_name): 250 | removeModelFromCache(model_name) 251 | return gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)) 252 | 253 | def user(user_message, history): 254 | return "", history + [[user_message, None]] 255 | 256 | def downloadConvertModel(model_repo_id): 257 | if model_repo_id: 258 | snapshot_download_and_convert_to_gguf(model_repo_id) 259 | return gr.update(value=""), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)), gr.update(choices=list_converted_gguf_models(cache_gguf_dir)) 260 | else: 261 | raise gr.Error("Repo can not be empty!") 262 | 263 | def loadModel(repo_id): 264 | global llm_chain, llm 265 | model_path = snapshot_download_and_convert_to_gguf(repo_id) 266 | llm_chain, llm = init_llm_chain(model_path) 267 | update_config(config, repo_id=repo_id) 268 | 269 | def loadModelFromModelsTab(model_repo_id): 270 | loadModel(model_repo_id) 271 | return gr.update(value=model_repo_id), gr.Tabs(selected="chat") 272 | 273 | def loadModelFromChatTab(repo_id): 274 | loadModel(repo_id) 275 | return gr.update(value=repo_id) 276 | 277 | def bot(history): 278 | print("Question: ", history[-1][0]) 279 | output = llm_chain.stream(history[-1][0]) 280 | print("stream:", output) 281 | history[-1][1] = "" 282 | for character in output: 283 | # print(character) 284 | history[-1][1] += character 285 | yield history 286 | 287 | submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot) 288 | # stop.click(None, None, None, cancels=[submit_event], queue=False) 289 | download_convert_btn.click(downloadConvertModel, model_repo_id, [model_repo_id, converted_models_chat, converted_models, saved_gguf_models], queue=False, show_progress="full") 290 | send_to_chat_btn.click(loadModelFromModelsTab, converted_models, [converted_models_chat, tabs], queue=False, show_progress="full") 291 | converted_models_chat.change(loadModelFromChatTab, converted_models_chat, converted_models_chat, queue=False, show_progress="full") 292 | remove_model_btn.click(removeModel, saved_gguf_models, [saved_gguf_models, converted_models_chat, converted_models], queue=False, show_progress="full") 293 | execution_provider.change(updateExecutionProvider, [execution_provider, converted_models_chat], execution_provider, queue=False, show_progress="full") 294 | config_update_btn.click(updateConfigs, [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input], [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input, tabs], queue=False, show_progress="full") 295 | config_reset_btn.click(resetConfigs, None, [n_gpu_layers_input, n_ctx_input, n_batch_input, n_parts_input, temperature_input, max_tokens_input], queue=False, show_progress="full") 296 | 297 | demo.queue() 298 | 299 | def launch_demo(): 300 | demo.launch(server_name=args.host, server_port=args.port, share=args.share) 301 | 302 | if __name__ == "__main__": 303 | threading.Thread(target=launch_demo).start() 304 | asyncio.run(main()) --------------------------------------------------------------------------------
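For reference, here is one way to exercise the raw WebSocket endpoint that `webui.py` serves on port 7861 (see `start_websockets` above) from a terminal. This is a sketch rather than part of the repository: it assumes LLMinator is already running locally and that the `websockets` package (the one `webui.py` imports, though it is not pinned in `requirements.txt`) is importable; the prompt text and the 30-second idle timeout are arbitrary choices.

```python
# client_sketch.py - hypothetical helper, not part of the LLMinator repo.
# Minimal client for the ws://localhost:7861/ endpoint served by webui.py.
import asyncio

import websockets

async def main() -> None:
    async with websockets.connect("ws://localhost:7861/") as ws:
        await ws.send("Explain what a GGUF file is in one sentence.")
        # webui.py streams the response in small chunks and keeps the
        # connection open afterwards, so stop once the stream goes quiet.
        try:
            while True:
                chunk = await asyncio.wait_for(ws.recv(), timeout=30)
                print(chunk, end="", flush=True)
        except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed):
            print()

if __name__ == "__main__":
    asyncio.run(main())
```

The `example/index.html` page mentioned in the README does the equivalent from the browser, and `main.py` exposes a similar FastAPI WebSocket endpoint at `/generate`.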