├── requirements.txt
├── llmx
│   ├── __init__.py
│   └── cli.py
├── pyproject.toml
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
requests>=2.31.0
rich>=13.0.0
mlx-lm>=0.0.3
huggingface-hub>=0.20.0
--------------------------------------------------------------------------------
/llmx/__init__.py:
--------------------------------------------------------------------------------
"""
LLMX - CLI tool for managing MLX-LM models and servers
"""

__version__ = "0.1.0"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "llmx"
version = "0.1.0"
description = "CLI tool for managing MLX-LM models and servers"
authors = [
    {name = "Your Name", email = "your.email@example.com"},
]
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

dependencies = [
    "requests>=2.31.0",
    "rich>=13.0.0",
    "mlx-lm>=0.0.3",
    "huggingface-hub>=0.20.0",
]

[project.scripts]
llmx = "llmx.cli:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# llmx

A CLI tool for managing MLX-LM models and servers. LLMX provides a convenient interface for downloading, managing, and serving MLX-LM models with OpenAI-compatible API endpoints.

## Installation

### Quick Install (Recommended)

Install directly using `uv`:
```bash
uv venv --python 3.11
source .venv/bin/activate
uv pip install git+https://github.com/vaibhavs10/llmx.git
```

### Development Setup

If you want to develop or modify the code:
```bash
# Clone the repository
git clone https://github.com/vaibhavs10/llmx.git
cd llmx

# Install in development mode with uv
uv pip install -e .
```

## Usage

LLMX provides several commands for managing MLX-LM models:

### Available Commands

```bash
# Start a model server
llmx serve <model_id> [--port PORT]

# Start an interactive chat session
llmx chat <model_id> [--port PORT] [--temperature TEMP]

# Stop a running model
llmx stop <port>

# Pull a model from Hugging Face
llmx pull <model_id>

# List downloaded models
llmx list

# List running models
llmx ps

# Get help
llmx help
```

### Example Usage

1. Start an interactive chat session:
```bash
llmx chat mlx-community/Mistral-7B-Instruct-v0.3-4bit --temperature 0.7
```

2. Start a model server (for API access):
```bash
llmx serve mlx-community/Mistral-7B-Instruct-v0.3-4bit --port 8080
```

3. Make a request to the server (a Python equivalent is sketched after this list):
```bash
curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

4. List running models:
```bash
llmx ps
```

5. Stop the server:
```bash
llmx stop 8080
```
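Because the server exposes an OpenAI-compatible endpoint, the same request can be made from Python with `requests` (already a dependency). This is a minimal sketch, assuming the server from step 2 is running on port 8080 and returns the standard OpenAI-style response shape:

```python
import requests

# Illustrative client: assumes `llmx serve ... --port 8080` is already running
response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7,
    },
    timeout=30,
)
response.raise_for_status()
# Standard OpenAI-style response shape: choices[0].message.content
print(response.json()["choices"][0]["message"]["content"])
```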

## Interactive Chat Mode

The `chat` command provides an interactive chat interface where you can have a conversation with the model. Features include:

- Automatic server management (starts/stops as needed)
- Markdown rendering of responses
- Conversation history tracking
- Type 'exit' or press Ctrl+C to end the chat
- Option to keep the server running after chat ends
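Under the hood, the chat client streams tokens from the same `/v1/chat/completions` endpoint. The sketch below mirrors what the chat command does internally; the exact chunk fields depend on the mlx_lm server version, so treat it as illustrative:

```python
import json
import requests

# Illustrative streaming client: assumes a server is already running on port 8080
with requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Tell me a short joke."}],
        "temperature": 0.7,
        "stream": True,
    },
    stream=True,
    timeout=30,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if not line:
            continue
        payload = line.decode("utf-8")
        if payload.startswith("data: "):
            payload = payload[len("data: "):]  # strip the SSE prefix
        try:
            data = json.loads(payload)
        except json.JSONDecodeError:
            continue  # e.g. a "[DONE]" sentinel or keep-alive line
        if data.get("choices"):
            delta = data["choices"][0].get("delta", {}).get("content", "")
            if delta:
                print(delta, end="", flush=True)
print()
```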

## Model Storage

Models are stored in the Hugging Face Hub's default cache location (`~/.cache/huggingface/hub` on Unix systems). This allows for better integration with other tools and avoids duplicating storage. The `llmx` tool manages only the running state of models in `~/.llmx/running.json`.
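For reference, `running.json` is a small JSON object keyed by port, matching what the CLI writes when a server starts. An illustrative entry (the PID is made up):

```json
{
  "8080": {
    "model_id": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "pid": 12345,
    "port": 8080
  }
}
```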

## Requirements

- Python 3.8 or higher
- MLX-LM
- Rich
- Requests
- Hugging Face Hub
--------------------------------------------------------------------------------
/llmx/cli.py:
--------------------------------------------------------------------------------
"""
LLMX CLI implementation
"""
import os
import json
import argparse
from pathlib import Path
import subprocess
import signal
import requests
from rich.console import Console
from rich.table import Table
from rich.markdown import Markdown
from rich.prompt import Prompt, Confirm
from rich.panel import Panel
from huggingface_hub import snapshot_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE

console = Console()
# JSON file that records which model servers are currently running (keyed by port)
LLMX_HOME = os.path.expanduser("~/.llmx/running.json")
os.makedirs(os.path.dirname(LLMX_HOME), exist_ok=True)

def manage_running_models(action='load', port=None, server_info=None):
    """Unified function to manage running models state"""
    try:
        running_models = {}
        if os.path.exists(LLMX_HOME):
            with open(LLMX_HOME, 'r') as f:
                running_models = json.load(f)

        # Clean up stale entries
        running_models = {
            p: info for p, info in running_models.items()
            if is_process_running(info.get("pid"))
        }

        if action == 'load':
            return running_models
        elif action == 'save':
            if server_info:
                running_models[port] = server_info
            elif port in running_models:
                del running_models[port]
            with open(LLMX_HOME, 'w') as f:
                json.dump(running_models, f)
            return running_models
    except (json.JSONDecodeError, IOError):
        return {}

def is_process_running(pid):
    """Check if a process is running"""
    try:
        # Signal 0 checks for existence without affecting the process
        os.kill(pid, 0)
        return True
    except (ProcessLookupError, TypeError):
        return False

def manage_server(action, model_id=None, port=None):
    """Unified function to manage model servers"""
    running_models = manage_running_models('load')

    if action == 'start':
        if str(port) in running_models:
            console.print(f"[red]Port {port} is already in use[/red]")
            return None

        try:
            cmd = f"mlx_lm.server --model {model_id} --port {port}"
            # start_new_session puts the server in its own process group so it can be stopped with killpg
            process = subprocess.Popen(cmd.split(), start_new_session=True)
            server_info = {"model_id": model_id, "pid": process.pid, "port": port}
            manage_running_models('save', str(port), server_info)
            console.print(f"[green]Started model server for {model_id} on port {port}[/green]")
            return server_info
        except Exception as e:
            console.print(f"[red]Error starting server: {str(e)}[/red]")
            return None

    elif action == 'stop':
        try:
            # Terminate the whole process group started for this server
            os.killpg(os.getpgid(running_models[str(port)]["pid"]), signal.SIGTERM)
            manage_running_models('save', str(port))
            console.print(f"[green]Stopped server on port {port}[/green]")
            return True
        except (KeyError, ProcessLookupError):
            console.print(f"[red]No server running on port {port}[/red]")
            return False

def chat_session(port, temperature=0.7):
    """Run an interactive chat session"""
    messages = []
    try:
        while True:
            try:
                user_input = Prompt.ask("[bold green]You[/bold green]")
                if user_input.lower() == 'exit':
                    break

                messages.append({"role": "user", "content": user_input})
                console.print("\n[bold purple]Assistant[/bold purple]")

                # Stream the response
                current_message = []
                with requests.post(
                    f"http://localhost:{port}/v1/chat/completions",
                    json={"messages": messages, "temperature": temperature, "stream": True},
                    headers={"Content-Type": "application/json"},
                    timeout=30,
                    stream=True
                ) as response:
                    if response.status_code != 200:
                        console.print(f"[red]Error: Server returned status code {response.status_code}[/red]")
                        continue

                    for line in response.iter_lines():
                        if not line:
                            continue
                        try:
                            payload = line.decode('utf-8')
                            if payload.startswith('data: '):
                                # Strip the SSE prefix manually (str.removeprefix needs Python 3.9+)
                                payload = payload[len('data: '):]
                            data = json.loads(payload)
                            if data.get("choices"):
                                chunk = data["choices"][0].get("delta", {}).get("content", "")
                                if chunk:
                                    current_message.append(chunk)
                                    console.print(chunk, end="")
                        except json.JSONDecodeError:
                            continue

                console.print()  # New line after response
                assistant_message = "".join(current_message)
                messages.append({"role": "assistant", "content": assistant_message})

            except (KeyboardInterrupt, requests.exceptions.RequestException) as e:
                console.print(f"\n[red]Error: {str(e)}[/red]")
                break
    finally:
        return Confirm.ask("\nKeep server running?", default=False)

def main():
    parser = argparse.ArgumentParser(description="LLMX - MLX-LM model management tool", add_help=False)
    subparsers = parser.add_subparsers(dest='command')

    # Add command parsers
    for cmd, help_text in {
        'serve': 'Start a model server',
        'chat': 'Start an interactive chat session',
        'stop': 'Stop a running model',
        'pull': 'Pull a model from Hugging Face',
        'list': 'List downloaded models',
        'ps': 'List running models',
        'help': 'Show help message'
    }.items():
        cmd_parser = subparsers.add_parser(cmd, add_help=False, help=help_text)
        if cmd in ['serve', 'chat']:
            cmd_parser.add_argument('model_id')
            cmd_parser.add_argument('--port', type=int, default=8080)
            if cmd == 'chat':
                cmd_parser.add_argument('--temperature', type=float, default=0.7)
        elif cmd == 'stop':
            cmd_parser.add_argument('port')
        elif cmd == 'pull':
            cmd_parser.add_argument('model_id')

    args = parser.parse_args()
    running_models = manage_running_models('load')

    if args.command in ['serve', 'chat']:
        try:
            # Resolve the repo in the local HF cache; the large weight files are skipped here
            model_path = snapshot_download(
                repo_id=args.model_id,
                ignore_patterns=["*.safetensors", "*.bin"]
            )
            if not model_path:
                return

            port = int(getattr(args, 'port', 8080))
            if server_info := manage_server('start', args.model_id, port):
                if args.command == 'chat':
                    import time
                    time.sleep(2)  # Give server time to start
                    if not chat_session(port, getattr(args, 'temperature', 0.7)):
                        manage_server('stop', port=port)

        except Exception as e:
            console.print(f"[red]Error: {str(e)}[/red]")

    elif args.command == 'stop':
        manage_server('stop', port=args.port)

    elif args.command == 'pull':
        try:
            snapshot_download(
                repo_id=args.model_id,
                ignore_patterns=["*.safetensors", "*.bin"]
            )
        except Exception as e:
            console.print(f"[red]Error pulling model: {str(e)}[/red]")

    elif args.command == 'list':
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Model ID", style="green")
        table.add_column("Size", justify="right")

        for model_dir in Path(HUGGINGFACE_HUB_CACHE).glob("models--*"):
            try:
                parts = model_dir.name.split('--')
                model_id = f"{parts[1]}/{parts[2]}" if len(parts) == 3 else parts[1]
                # lstat() does not follow symlinks (and works on Python 3.8), so blob files are not double-counted
                size = sum(f.lstat().st_size for f in model_dir.glob('**/*') if f.is_file())
                table.add_row(model_id, f"{size / 1024 / 1024:.1f} MB")
            except Exception:
                continue
        console.print(table)

    elif args.command == 'ps':
        if not running_models:
            console.print("[yellow]No running models[/yellow]")
            return

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Port")
        table.add_column("Model ID")
        table.add_column("Status")

        for port, info in running_models.items():
            status = "[green]Running[/green]" if is_process_running(info["pid"]) else "[red]Stopped[/red]"
            table.add_row(str(port), info["model_id"], status)
        console.print(table)

    else:
        help_text = """
[bold]llmx Commands:[/bold]

  [green]serve[/green] <model_id> [--port PORT]    Start a model server
  [green]chat[/green] <model_id> [--port PORT]     Start an interactive chat session
  [green]stop[/green] <port>                       Stop a running model
  [green]pull[/green] <model_id>                   Pull a model from Hugging Face
  [green]list[/green]                              List downloaded models
  [green]ps[/green]                                List running models
  [green]help[/green]                              Show this help message

[bold]Example:[/bold] llmx chat mlx-community/Mistral-7B-Instruct-v0.3-4bit --temperature 0.7
"""
        console.print(Panel(help_text, title="LLMX Help", border_style="blue"))

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------