├── Dockerfile
├── LICENSE
├── README.MD
├── build_file.sh
└── chat.py

/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1

ARG DEBIAN_VERSION=bullseye

################################################################################
# Use a Debian slim image as the downloader stage for the final image.
# https://hub.docker.com/_/debian
################################################################################
FROM debian:${DEBIAN_VERSION}-slim AS downloader

# Set the working directory.
WORKDIR /download

# Install curl.
RUN apt-get update && apt-get install -y curl

# Download the pinned llamafile release (0.8.14) from GitHub.
RUN curl -L -o ./llamafile https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.14/llamafile-0.8.14

# Make llamafile executable.
RUN chmod +x ./llamafile

################################################################################
# Use a Debian slim image as the final stage.
# https://hub.docker.com/_/debian
################################################################################
FROM debian:${DEBIAN_VERSION}-slim AS final

# Create a user so llamafile runs as non-root.
RUN addgroup --gid 1000 user
RUN adduser --uid 1000 --gid 1000 --disabled-password --gecos "" user

# Switch to the non-root user.
USER user

# Set the working directory.
WORKDIR /usr/src/app

# Copy llamafile from the downloader stage.
COPY --from=downloader /download/llamafile ./llamafile

# Expose port 8080.
EXPOSE 8080

# Set the entrypoint. llamafile is a portable executable, so it is launched
# through /bin/sh rather than executed directly.
ENTRYPOINT ["/bin/sh", "/usr/src/app/llamafile"]

#ENTRYPOINT ["./llamafile"]

# Set the default command.
#CMD ["--server", "--host", "0.0.0.0", "-m", "${MODEL_PATH}"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The Apache 2.0 License

Copyright 2023 Mozilla Foundation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
# Llamafile: Simplified LLM Deployment

## Overview
Llamafile is a tool that converts Large Language Models (LLMs) into standalone executable files. This approach offers several advantages:

- **Enhanced Performance**: 30% to 500% faster inference compared to Ollama
- **CPU-Based Inference**: Run models efficiently on CPUs
- **Streamlined Deployment**: A simple deployment process using this repository

## Prerequisites
- Docker (installed and running)
- Git
- Unix-based terminal (Git Bash for Windows users)

## Installation

1. Clone this repository to your local machine:
```bash
git clone https://github.com/brainqub3/llamafile_chat.git
```

2. Navigate to the project directory and execute the build script:
```bash
./build_file.sh
```
The script prompts for a model URL (typically a direct link to a `.gguf` file), downloads it into `./model`, builds the Docker image, and starts the container on port 8080.
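If the image has already been built, you do not need to re-run the whole script to bring the server back up. The script's final step is the `docker run` command sketched below (taken from `build_file.sh`); `<model_filename>` is a placeholder for whichever file was downloaded into `./model`:
```bash
# Reuse the image built by build_file.sh and mount the local model directory.
docker run -p 8080:8080 \
    -v "$(pwd)/model:/usr/src/app/model" \
    llamafile_image \
    --server \
    --host 0.0.0.0 \
    -m "/usr/src/app/model/<model_filename>"
```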
## Usage Options

### 1. Web Interface
Access the built-in web interface through your browser:
```
http://127.0.0.1:8080
```

### 2. API Integration
Interact with the model programmatically through the server's OpenAI-compatible API. Because the container publishes port 8080 to the host, the endpoint is reachable at:
```
http://127.0.0.1:8080
```
(On Linux hosts the container's bridge address, for example `http://172.17.0.2:8080`, may also work.)
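As a quick connectivity check, you can send the same kind of request that `chat.py` sends. A minimal sketch with `curl` (the model name and placeholder API key mirror the values used in `chat.py`; the prompt is just an example):
```bash
curl http://127.0.0.1:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key-required" \
    -d '{
        "model": "LLaMA_CPP",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one sentence."}
        ]
    }'
```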
For terminal-based interactions, we provide a Python script:
```bash
python chat.py
```
This script lets you chat with the model directly from the command line, streaming responses as they are generated.

## Additional Resources

- [Llamafile Technical Deep Dive](https://justine.lol/matmul/) - Comprehensive blog post explaining the technology
- [Official GitHub Repository](https://github.com/Mozilla-Ocho/llamafile) - Source code and documentation
--------------------------------------------------------------------------------
/build_file.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Prevent MSYS/Git Bash from rewriting Unix-style paths passed to Docker.
export MSYS_NO_PATHCONV=1

# Prompt the user for the model URL
read -p "Please enter the model URL: " model_url

# Extract the model filename from the URL
model_filename=$(basename "$model_url")

# Create a directory to store the model
mkdir -p ./model

# Download the model if it doesn't already exist
if [ ! -f "./model/$model_filename" ]; then
    echo "Downloading model..."
    curl -L -o "./model/$model_filename" "$model_url"
else
    echo "Model already exists."
fi

# Build the Docker image
echo "Building Docker image..."
docker build --no-cache -t llamafile_image .

# Run the Docker container, mounting the model directory
echo "Running Docker container..."

# For debugging
echo "Model filename: $model_filename"
echo "Current directory: $(pwd)"
echo "Running Docker container with the following command:"
echo "docker run -p 8080:8080 -v \"$(pwd)/model:/usr/src/app/model\" llamafile_image --server --host 0.0.0.0 -m \"/usr/src/app/model/$model_filename\""

docker run -p 8080:8080 \
    -v "$(pwd)/model:/usr/src/app/model" \
    llamafile_image \
    --server \
    --host 0.0.0.0 \
    -m "/usr/src/app/model/$model_filename"
--------------------------------------------------------------------------------
/chat.py:
--------------------------------------------------------------------------------
import requests
import time
import json

class LlamaFileChat:
    def __init__(self, host='localhost', port=8080, api_key=None):
        self.llamafile_host = host
        self.llamafile_port = port
        self.api_key = api_key
        # Initialize the conversation with the system message
        self.conversation = [
            {
                "role": "system",
                "content": "You are a helpful assistant responding in a friendly, casual, and jokey tone."
            }
        ]

    def call_llamafile_api(self, messages, stream=True):
        try:
            llamafile_api_url = f"http://{self.llamafile_host}:{self.llamafile_port}/v1/chat/completions"
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key or 'no-key-required'}"
            }
            payload = {
                "model": "LLaMA_CPP",
                "messages": messages,
                "stream": stream
            }
            start_time = time.time()
            response = requests.post(llamafile_api_url, headers=headers, json=payload, stream=stream)

            if response.status_code != 200:
                print(f"Error: Received status code {response.status_code}")
                print(f"Response Content: {response.content.decode('utf-8')}")
                return None

            if stream:
                def generate():
                    first_chunk_time_recorded = False
                    for chunk in response.iter_content(chunk_size=None):
                        if chunk:
                            if not first_chunk_time_recorded:
                                first_chunk_time = time.time()
                                ttft = first_chunk_time - start_time
                                print(f"\nTime to First Token: {ttft:.2f} seconds\n")
                                first_chunk_time_recorded = True
                            decoded_chunk = chunk.decode('utf-8')
                            # Split the chunk into lines in case multiple data entries are in the chunk
                            lines = decoded_chunk.strip().split('\n')
                            for line in lines:
                                if line.startswith('data: '):
                                    data_str = line[len('data: '):]
                                    if data_str.strip() == '[DONE]':
                                        return
                                    try:
                                        data = json.loads(data_str)
                                        delta_content = data.get('choices', [{}])[0].get('delta', {}).get('content', '')
                                        yield delta_content
                                    except json.JSONDecodeError as e:
                                        print(f"JSONDecodeError: {e} - Line Content: {data_str}")
                                else:
                                    pass  # Ignore irrelevant lines
                return generate()
            else:
                response_json = response.json()
                ttft = time.time() - start_time
                print(f"\nTime to First Token: {ttft:.2f} seconds\n")
                return response_json.get('choices', [{}])[0].get('message', {}).get('content', '')
        except Exception as e:
            print(f"Error while calling Llamafile API: {e}")
            return None

def main():
    chat = LlamaFileChat()
    print("Welcome to Llamafile Chat! Type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Goodbye!")
            break
        chat.conversation.append({"role": "user", "content": user_input})
        print("Assistant:", end=' ', flush=True)
        response_generator = chat.call_llamafile_api(chat.conversation)
        if response_generator:
            assistant_response = ''
            for chunk in response_generator:
                print(chunk, end='', flush=True)
                assistant_response += chunk
            print()
            chat.conversation.append({"role": "assistant", "content": assistant_response})
        else:
            print("Failed to get a response from Llamafile API.")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
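One operational detail worth noting: `build_file.sh` starts the container in the foreground without `--name` or `--rm`, so stopped containers can accumulate across runs. A minimal cleanup sketch using standard Docker commands (`<container_id>` is a placeholder for whatever ID `docker ps` prints on your machine):
```bash
# List containers created from the image built by build_file.sh.
docker ps -a --filter ancestor=llamafile_image

# Stop a running container by the ID printed above.
docker stop <container_id>

# Optionally remove all stopped containers.
docker container prune
```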