├── Dockerfile
├── LICENSE
├── README.MD
├── build_file.sh
└── chat.py

/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1

ARG DEBIAN_VERSION=bullseye

################################################################################
# Use a Debian slim image as the downloader stage for the final image.
# https://hub.docker.com/_/debian
################################################################################
FROM debian:${DEBIAN_VERSION}-slim AS downloader

# Set the working directory.
WORKDIR /download

# Install curl.
RUN apt-get update && apt-get install -y curl

# Download the pinned llamafile release (0.8.14) from GitHub.
RUN curl -L -o ./llamafile https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.14/llamafile-0.8.14

# Make llamafile executable.
RUN chmod +x ./llamafile

################################################################################
# Use a Debian slim image as the final stage.
# https://hub.docker.com/_/debian
################################################################################
FROM debian:${DEBIAN_VERSION}-slim AS final

# Create a user so llamafile runs as non-root.
RUN addgroup --gid 1000 user
RUN adduser --uid 1000 --gid 1000 --disabled-password --gecos "" user

# Switch to the non-root user.
USER user

# Set the working directory.
WORKDIR /usr/src/app

# Copy llamafile from the downloader stage.
COPY --from=downloader /download/llamafile ./llamafile

# Expose port 8080.
EXPOSE 8080

# Set the entrypoint. llamafile is a portable executable, so it is launched
# through /bin/sh rather than executed directly.
ENTRYPOINT ["/bin/sh", "/usr/src/app/llamafile"]

#ENTRYPOINT ["./llamafile"]

# Set the default command.
#CMD ["--server", "--host", "0.0.0.0", "-m", "${MODEL_PATH}"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The Apache 2.0 License

Copyright 2023 Mozilla Foundation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
# Llamafile: Simplified LLM Deployment

## Overview
Llamafile is a tool that converts Large Language Models (LLMs) into standalone executable files. This approach offers several advantages:

- **Enhanced Performance**: 30% to 500% faster inference compared to Ollama
- **CPU-Based Inference**: Run models efficiently on CPUs
- **Streamlined Deployment**: A simple deployment process using this repository

## Prerequisites
- Docker (installed and running)
- Git
- Unix-based terminal (Git Bash for Windows users)

## Installation

1. Clone this repository to your local machine:
```bash
git clone https://github.com/brainqub3/llamafile_chat.git
```

2. Navigate to the project directory and execute the build script:
```bash
./build_file.sh
```
The script prompts for a model URL (typically a direct link to a `.gguf` file), downloads it into `./model`, builds the Docker image, and starts the container on port 8080.
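If the image has already been built, you do not need to re-run the whole script to bring the server back up. The script's final step is the `docker run` command sketched below (taken from `build_file.sh`); `<model_filename>` is a placeholder for whichever file was downloaded into `./model`:
```bash
# Reuse the image built by build_file.sh and mount the local model directory.
docker run -p 8080:8080 \
    -v "$(pwd)/model:/usr/src/app/model" \
    llamafile_image \
    --server \
    --host 0.0.0.0 \
    -m "/usr/src/app/model/<model_filename>"
```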
## Usage Options

### 1. Web Interface
Access the built-in web interface through your browser:
```
http://127.0.0.1:8080
```

### 2. API Integration
Interact with the model programmatically through the server's OpenAI-compatible API. Because the container publishes port 8080 to the host, the endpoint is reachable at:
```
http://127.0.0.1:8080
```
(On Linux hosts the container's bridge address, for example `http://172.17.0.2:8080`, may also work.)
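As a quick connectivity check, you can send the same kind of request that `chat.py` sends. A minimal sketch with `curl` (the model name and placeholder API key mirror the values used in `chat.py`; the prompt is just an example):
```bash
curl http://127.0.0.1:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key-required" \
    -d '{
        "model": "LLaMA_CPP",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one sentence."}
        ]
    }'
```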
For terminal-based interactions, we provide a Python script:
```bash
python chat.py
```
This script lets you chat with the model directly from the command line, streaming responses as they are generated.

## Additional Resources

- [Llamafile Technical Deep Dive](https://justine.lol/matmul/) - Comprehensive blog post explaining the technology
- [Official GitHub Repository](https://github.com/Mozilla-Ocho/llamafile) - Source code and documentation
--------------------------------------------------------------------------------
/build_file.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Prevent MSYS/Git Bash from rewriting Unix-style paths passed to Docker.
export MSYS_NO_PATHCONV=1

# Prompt the user for the model URL
read -p "Please enter the model URL: " model_url

# Extract the model filename from the URL
model_filename=$(basename "$model_url")

# Create a directory to store the model
mkdir -p ./model

# Download the model if it doesn't already exist
if [ ! -f "./model/$model_filename" ]; then
    echo "Downloading model..."
    curl -L -o "./model/$model_filename" "$model_url"
else
    echo "Model already exists."
fi

# Build the Docker image
echo "Building Docker image..."
docker build --no-cache -t llamafile_image .

# Run the Docker container, mounting the model directory
echo "Running Docker container..."

# For debugging
echo "Model filename: $model_filename"
echo "Current directory: $(pwd)"
echo "Running Docker container with the following command:"
echo "docker run -p 8080:8080 -v \"$(pwd)/model:/usr/src/app/model\" llamafile_image --server --host 0.0.0.0 -m \"/usr/src/app/model/$model_filename\""

docker run -p 8080:8080 \
    -v "$(pwd)/model:/usr/src/app/model" \
    llamafile_image \
    --server \
    --host 0.0.0.0 \
    -m "/usr/src/app/model/$model_filename"
--------------------------------------------------------------------------------
/chat.py:
--------------------------------------------------------------------------------
import requests
import time
import json

class LlamaFileChat:
    def __init__(self, host='localhost', port=8080, api_key=None):
        self.llamafile_host = host
        self.llamafile_port = port
        self.api_key = api_key
        # Initialize the conversation with the system message
        self.conversation = [
            {
                "role": "system",
                "content": "You are a helpful assistant responding in a friendly, casual, and jokey tone."
            }
        ]

    def call_llamafile_api(self, messages, stream=True):
        try:
            llamafile_api_url = f"http://{self.llamafile_host}:{self.llamafile_port}/v1/chat/completions"
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key or 'no-key-required'}"
            }
            payload = {
                "model": "LLaMA_CPP",
                "messages": messages,
                "stream": stream
            }
            start_time = time.time()
            response = requests.post(llamafile_api_url, headers=headers, json=payload, stream=stream)

            if response.status_code != 200:
                print(f"Error: Received status code {response.status_code}")
                print(f"Response Content: {response.content.decode('utf-8')}")
                return None

            if stream:
                def generate():
                    first_chunk_time_recorded = False
                    for chunk in response.iter_content(chunk_size=None):
                        if chunk:
                            if not first_chunk_time_recorded:
                                first_chunk_time = time.time()
                                ttft = first_chunk_time - start_time
                                print(f"\nTime to First Token: {ttft:.2f} seconds\n")
                                first_chunk_time_recorded = True
                            decoded_chunk = chunk.decode('utf-8')
                            # Split the chunk into lines in case multiple data entries are in the chunk
                            lines = decoded_chunk.strip().split('\n')
                            for line in lines:
                                if line.startswith('data: '):
                                    data_str = line[len('data: '):]
                                    if data_str.strip() == '[DONE]':
                                        return
                                    try:
                                        data = json.loads(data_str)
                                        delta_content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
                                        yield delta_content
                                    except json.JSONDecodeError as e:
                                        print(f"JSONDecodeError: {e} - Line Content: {data_str}")
                                else:
                                    pass  # Ignore irrelevant lines
                return generate()
            else:
                response_json = response.json()
                ttft = time.time() - start_time
                print(f"\nTime to First Token: {ttft:.2f} seconds\n")
                return response_json.get('choices', [{}])[0].get('message', {}).get('content', '')
        except Exception as e:
            print(f"Error while calling Llamafile API: {e}")
            return None

def main():
    chat = LlamaFileChat()
    print("Welcome to Llamafile Chat! Type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Goodbye!")
            break
        chat.conversation.append({"role": "user", "content": user_input})
        print("Assistant:", end=' ', flush=True)
        response_generator = chat.call_llamafile_api(chat.conversation)
        if response_generator:
            assistant_response = ''
            for chunk in response_generator:
                print(chunk, end='', flush=True)
                assistant_response += chunk
            print()
            chat.conversation.append({"role": "assistant", "content": assistant_response})
        else:
            print("Failed to get a response from Llamafile API.")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
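One operational detail worth noting: `build_file.sh` starts the container in the foreground without `--name` or `--rm`, so stopped containers can accumulate across runs. A minimal cleanup sketch using standard Docker commands (`<container_id>` is a placeholder for whatever ID `docker ps` prints on your machine):
```bash
# List containers created from the image built by build_file.sh.
docker ps -a --filter ancestor=llamafile_image

# Stop a running container by the ID printed above.
docker stop <container_id>

# Optionally remove all stopped containers.
docker container prune
```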