├── requirements.txt
├── llmx
│   ├── __init__.py
│   └── cli.py
├── pyproject.toml
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
requests>=2.31.0
rich>=13.0.0
mlx-lm>=0.0.3
huggingface-hub>=0.20.0
--------------------------------------------------------------------------------
/llmx/__init__.py:
--------------------------------------------------------------------------------
"""
LLMX - CLI tool for managing MLX-LM models and servers
"""

__version__ = "0.1.0"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "llmx"
version = "0.1.0"
description = "CLI tool for managing MLX-LM models and servers"
authors = [
    {name = "Your Name", email = "your.email@example.com"},
]
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

dependencies = [
    "requests>=2.31.0",
    "rich>=13.0.0",
    "mlx-lm>=0.0.3",
    "huggingface-hub>=0.20.0",
]

[project.scripts]
llmx = "llmx.cli:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# llmx

A CLI tool for managing MLX-LM models and servers. LLMX provides a convenient interface for downloading, managing, and serving MLX-LM models with OpenAI-compatible API endpoints.

## Installation

### Quick Install (Recommended)

Install directly using `uv`:
```bash
uv venv --python 3.11
source .venv/bin/activate
uv pip install git+https://github.com/vaibhavs10/llmx.git
```

### Development Setup

If you want to develop or modify the code:
```bash
# Clone the repository
git clone https://github.com/vaibhavs10/llmx.git
cd llmx

# Install in development mode with uv
uv pip install -e .
```

## Usage

LLMX provides several commands for managing MLX-LM models:

### Available Commands

```bash
# Start a model server
llmx serve <model_id> [--port PORT]

# Start an interactive chat session
llmx chat <model_id> [--port PORT] [--temperature TEMP]

# Stop a running model
llmx stop <port>

# Pull a model from Hugging Face
llmx pull <model_id>

# List downloaded models
llmx list

# List running models
llmx ps

# Get help
llmx help
```

### Example Usage

1. Start an interactive chat session:
```bash
llmx chat mlx-community/Mistral-7B-Instruct-v0.3-4bit --temperature 0.7
```

2. Start a model server (for API access):
```bash
llmx serve mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8080
```

3. Make a request to the server (a Python equivalent is sketched after this list):
```bash
curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

4. List running models:
```bash
llmx ps
```

5. Stop the server:
```bash
llmx stop 8080
```
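Because the server exposes an OpenAI-compatible endpoint, the same request can be made from Python with `requests` (already a dependency). This is a minimal sketch, assuming the server from step 2 is running on port 8080 and returns the standard OpenAI-style response shape:

```python
import requests

# Illustrative client: assumes `llmx serve ... --port 8080` is already running
response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7,
    },
    timeout=30,
)
response.raise_for_status()
# Standard OpenAI-style response shape: choices[0].message.content
print(response.json()["choices"][0]["message"]["content"])
```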

## Interactive Chat Mode

The `chat` command provides an interactive chat interface where you can have a conversation with the model. Features include:

- Automatic server management (starts/stops as needed)
- Markdown rendering of responses
- Conversation history tracking
- Type 'exit' or press Ctrl+C to end the chat
- Option to keep the server running after chat ends
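Under the hood, the chat client streams tokens from the same `/v1/chat/completions` endpoint. The sketch below mirrors what the chat command does internally; the exact chunk fields depend on the mlx_lm server version, so treat it as illustrative:

```python
import json
import requests

# Illustrative streaming client: assumes a server is already running on port 8080
with requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Tell me a short joke."}],
        "temperature": 0.7,
        "stream": True,
    },
    stream=True,
    timeout=30,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if not line:
            continue
        payload = line.decode("utf-8")
        if payload.startswith("data: "):
            payload = payload[len("data: "):]  # strip the SSE prefix
        try:
            data = json.loads(payload)
        except json.JSONDecodeError:
            continue  # e.g. a "[DONE]" sentinel or keep-alive line
        if data.get("choices"):
            delta = data["choices"][0].get("delta", {}).get("content", "")
            if delta:
                print(delta, end="", flush=True)
print()
```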

## Model Storage

Models are stored in the Hugging Face Hub's default cache location (`~/.cache/huggingface/hub` on Unix systems). This allows for better integration with other tools and avoids duplicating storage. The `llmx` tool manages only the running state of models in `~/.llmx/running.json`.
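For reference, `running.json` is a small JSON object keyed by port, matching what the CLI writes when a server starts. An illustrative entry (the PID is made up):

```json
{
  "8080": {
    "model_id": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "pid": 12345,
    "port": 8080
  }
}
```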

## Requirements

- Python 3.8 or higher
- MLX-LM
- Rich
- Requests
- Hugging Face Hub
--------------------------------------------------------------------------------
/llmx/cli.py:
--------------------------------------------------------------------------------
"""
LLMX CLI implementation
"""
import os
import json
import argparse
from pathlib import Path
import subprocess
import signal
import requests
from rich.console import Console
from rich.table import Table
from rich.markdown import Markdown
from rich.prompt import Prompt, Confirm
from rich.panel import Panel
from huggingface_hub import snapshot_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE

console = Console()
# JSON file that records which model servers are currently running (keyed by port)
LLMX_HOME = os.path.expanduser("~/.llmx/running.json")
os.makedirs(os.path.dirname(LLMX_HOME), exist_ok=True)

def manage_running_models(action='load', port=None, server_info=None):
    """Unified function to manage running models state"""
    try:
        running_models = {}
        if os.path.exists(LLMX_HOME):
            with open(LLMX_HOME, 'r') as f:
                running_models = json.load(f)

        # Clean up stale entries
        running_models = {
            p: info for p, info in running_models.items()
            if is_process_running(info.get("pid"))
        }

        if action == 'load':
            return running_models
        elif action == 'save':
            if server_info:
                running_models[port] = server_info
            elif port in running_models:
                del running_models[port]
            with open(LLMX_HOME, 'w') as f:
                json.dump(running_models, f)
            return running_models
    except (json.JSONDecodeError, IOError):
        return {}

def is_process_running(pid):
    """Check if a process is running"""
    try:
        # Signal 0 checks for existence without affecting the process
        os.kill(pid, 0)
        return True
    except (ProcessLookupError, TypeError):
        return False

def manage_server(action, model_id=None, port=None):
    """Unified function to manage model servers"""
    running_models = manage_running_models('load')

    if action == 'start':
        if str(port) in running_models:
            console.print(f"[red]Port {port} is already in use[/red]")
            return None

        try:
            cmd = f"mlx_lm.server --model {model_id} --port {port}"
            # start_new_session puts the server in its own process group so it can be stopped with killpg
            process = subprocess.Popen(cmd.split(), start_new_session=True)
            server_info = {"model_id": model_id, "pid": process.pid, "port": port}
            manage_running_models('save', str(port), server_info)
            console.print(f"[green]Started model server for {model_id} on port {port}[/green]")
            return server_info
        except Exception as e:
            console.print(f"[red]Error starting server: {str(e)}[/red]")
            return None

    elif action == 'stop':
        try:
            # Terminate the whole process group started for this server
            os.killpg(os.getpgid(running_models[str(port)]["pid"]), signal.SIGTERM)
            manage_running_models('save', str(port))
            console.print(f"[green]Stopped server on port {port}[/green]")
            return True
        except (KeyError, ProcessLookupError):
            console.print(f"[red]No server running on port {port}[/red]")
            return False

def chat_session(port, temperature=0.7):
    """Run an interactive chat session"""
    messages = []
    try:
        while True:
            try:
                user_input = Prompt.ask("[bold green]You[/bold green]")
                if user_input.lower() == 'exit':
                    break

                messages.append({"role": "user", "content": user_input})
                console.print("\n[bold purple]Assistant[/bold purple]")

                # Stream the response
                current_message = []
                with requests.post(
                    f"http://localhost:{port}/v1/chat/completions",
                    json={"messages": messages, "temperature": temperature, "stream": True},
                    headers={"Content-Type": "application/json"},
                    timeout=30,
                    stream=True
                ) as response:
                    if response.status_code != 200:
                        console.print(f"[red]Error: Server returned status code {response.status_code}[/red]")
                        continue

                    for line in response.iter_lines():
                        if not line:
                            continue
                        try:
                            payload = line.decode('utf-8')
                            if payload.startswith('data: '):
                                # Strip the SSE prefix manually (str.removeprefix needs Python 3.9+)
                                payload = payload[len('data: '):]
                            data = json.loads(payload)
                            if data.get("choices"):
                                chunk = data["choices"][0].get("delta", {}).get("content", "")
                                if chunk:
                                    current_message.append(chunk)
                                    console.print(chunk, end="")
                        except json.JSONDecodeError:
                            continue

                console.print()  # New line after response
                assistant_message = "".join(current_message)
                messages.append({"role": "assistant", "content": assistant_message})

            except (KeyboardInterrupt, requests.exceptions.RequestException) as e:
                console.print(f"\n[red]Error: {str(e)}[/red]")
                break
    finally:
        return Confirm.ask("\nKeep server running?", default=False)

def main():
    parser = argparse.ArgumentParser(description="LLMX - MLX-LM model management tool", add_help=False)
    subparsers = parser.add_subparsers(dest='command')

    # Add command parsers
    for cmd, help_text in {
        'serve': 'Start a model server',
        'chat': 'Start an interactive chat session',
        'stop': 'Stop a running model',
        'pull': 'Pull a model from Hugging Face',
        'list': 'List downloaded models',
        'ps': 'List running models',
        'help': 'Show help message'
    }.items():
        cmd_parser = subparsers.add_parser(cmd, add_help=False, help=help_text)
        if cmd in ['serve', 'chat']:
            cmd_parser.add_argument('model_id')
            cmd_parser.add_argument('--port', type=int, default=8080)
            if cmd == 'chat':
                cmd_parser.add_argument('--temperature', type=float, default=0.7)
        elif cmd == 'stop':
            cmd_parser.add_argument('port')
        elif cmd == 'pull':
            cmd_parser.add_argument('model_id')

    args = parser.parse_args()
    running_models = manage_running_models('load')

    if args.command in ['serve', 'chat']:
        try:
            # Resolve the repo in the local HF cache; the large weight files are skipped here
            model_path = snapshot_download(
                repo_id=args.model_id,
                ignore_patterns=["*.safetensors", "*.bin"]
            )
            if not model_path:
                return

            port = int(getattr(args, 'port', 8080))
            if server_info := manage_server('start', args.model_id, port):
                if args.command == 'chat':
                    import time
                    time.sleep(2)  # Give server time to start
                    if not chat_session(port, getattr(args, 'temperature', 0.7)):
                        manage_server('stop', port=port)

        except Exception as e:
            console.print(f"[red]Error: {str(e)}[/red]")

    elif args.command == 'stop':
        manage_server('stop', port=args.port)

    elif args.command == 'pull':
        try:
            snapshot_download(
                repo_id=args.model_id,
                ignore_patterns=["*.safetensors", "*.bin"]
            )
        except Exception as e:
            console.print(f"[red]Error pulling model: {str(e)}[/red]")

    elif args.command == 'list':
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Model ID", style="green")
        table.add_column("Size", justify="right")

        for model_dir in Path(HUGGINGFACE_HUB_CACHE).glob("models--*"):
            try:
                parts = model_dir.name.split('--')
                model_id = f"{parts[1]}/{parts[2]}" if len(parts) == 3 else parts[1]
                # lstat() does not follow symlinks (and works on Python 3.8), so blob files are not double-counted
                size = sum(f.lstat().st_size for f in model_dir.glob('**/*') if f.is_file())
                table.add_row(model_id, f"{size / 1024 / 1024:.1f} MB")
            except Exception:
                continue
        console.print(table)

    elif args.command == 'ps':
        if not running_models:
            console.print("[yellow]No running models[/yellow]")
            return

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Port")
        table.add_column("Model ID")
        table.add_column("Status")

        for port, info in running_models.items():
            status = "[green]Running[/green]" if is_process_running(info["pid"]) else "[red]Stopped[/red]"
            table.add_row(str(port), info["model_id"], status)
        console.print(table)

    else:
        help_text = """
[bold]llmx Commands:[/bold]

  [green]serve[/green] <model_id> [--port PORT]    Start a model server
  [green]chat[/green] <model_id> [--port PORT]     Start an interactive chat session
  [green]stop[/green] <port>                       Stop a running model
  [green]pull[/green] <model_id>                   Pull a model from Hugging Face
  [green]list[/green]                              List downloaded models
  [green]ps[/green]                                List running models
  [green]help[/green]                              Show this help message

[bold]Example:[/bold] llmx chat mlx-community/Mistral-7B-Instruct-v0.3-4bit --temperature 0.7
"""
        console.print(Panel(help_text, title="LLMX Help", border_style="blue"))

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------