├── .gitignore
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── config
│   ├── Aquila-7B.json
│   ├── BAAI_namespace.json
│   ├── Baichuan-7B.json
│   ├── Baichuan2-13B-Chat-4bits.json
│   ├── Baichuan2-7B-Chat.json
│   ├── DeciLM-7B-instruct.json
│   ├── DeciLM-7B.json
│   ├── Deci_namespace.json
│   ├── DeepSeek-R1-Distill-Llama-8B.json
│   ├── DeepSeek-R1-Distill-Qwen-1.5B.json
│   ├── DeepSeek-R1-Distill-Qwen-7B.json
│   ├── EXAONE-3.0-7.8B-Instruct copy.json
│   ├── EXAONE-3.0-7.8B-Instruct.json
│   ├── EleutherAI_namespace.json
│   ├── Llama-2-13b-hf.json
│   ├── Llama-3.2-3B-Instruct.json
│   ├── Llama-3.2-3B-Instruct_2gpu.json
│   ├── Meta-Llama-3-8B-Instruct.json
│   ├── Meta-Llama-3-8B.json
│   ├── MiniCPM-2B-dpo-bf16.json
│   ├── MiniCPM-2B-sft-bf16.json
│   ├── MiniCPM3-4B.json
│   ├── Minitron-8B-Base.json
│   ├── Mistral-7B-Instruct-v0.1.json
│   ├── Mistral-7B-v0.1.json
│   ├── Mistral-7B-v0.1_2gpu.json
│   ├── OLMo-1B-hf.json
│   ├── OLMo-1B-hf_2gpu.json
│   ├── OLMo-7B-hf.json
│   ├── OLMoE-1B-7B-0924-Instruct.json
│   ├── OLMoE-1B-7B-0924.json
│   ├── OpenAssistant_namespace.json
│   ├── Phi-3-mini-128k-instruct.json
│   ├── Phi-3-mini-4k-instruct.json
│   ├── Qwen-VL-Chat.json
│   ├── Qwen.json
│   ├── Qwen1.5-MoE-A2.7B.json
│   ├── Qwen2.5-1.5B.json
│   ├── Qwen2.5-7B-Instruct-1M.json
│   ├── Qwen2.5-7B-Instruct-GPTQ-Int8.json
│   ├── Qwen2.5-7B.json
│   ├── Qwen2.5-Coder-1.5B-Instruct.json
│   ├── Qwen2.5-Coder-14B-Instruct-GPTQ-Int8.json
│   ├── Qwen2.5-Coder-3B.json
│   ├── Qwen2.5-Coder-7B-Instruct.json
│   ├── Qwen2.5-Math-1.5B-Instruct.json
│   ├── Qwen2.5-Math-1.5B.json
│   ├── Qwen2.5-Math-7B-Instruct.json
│   ├── Qwen2.5-Math-7B.json
│   ├── Qwen7BInt8.json
│   ├── Qwen_namespace.json
│   ├── Salesforce_namespace.json
│   ├── THUDM_namespace.json
│   ├── TinyLlama-1.1B-Chat-v1.0.json
│   ├── TinyLlama-1.1B-Chat-v1.0_13GB.json
│   ├── TinyLlama-1.1B-Chat-v1.0_2gpu.json
│   ├── TinyLlama-1.1B-Chat-v1.0_test.json
│   ├── TinyLlama_namespace.json
│   ├── XVERSE-13B-Chat.json
│   ├── XVERSE-7B-Chat.json
│   ├── allenai_namespace.json
│   ├── baichuan-inc_namespace.json
│   ├── bigcode_namespace.json
│   ├── chatglm3-6b-128k.json
│   ├── chatglm3-6b-32k.json
│   ├── chatglm3-6b.json
│   ├── codegen-2B-multi.json
│   ├── core42_jais-13b-bnb-4bit.json
│   ├── core42_jais-13b-chat-bnb-4bit.json
│   ├── databricks_namespace.json
│   ├── deepseek-ai_namespace.json
│   ├── deepseek-llm-7b-chat.json
│   ├── deepseek-llm-7b-chat_2gpu.json
│   ├── deepseek-math-7b-instruct.json
│   ├── deepseek-math-7b-rl.json
│   ├── deepseek-vl2-tiny.json
│   ├── dolly-v2-12b.json
│   ├── facebook_namespace.json
│   ├── falcon-7b.json
│   ├── falcon-rw-7b.json
│   ├── gemma-7b.json
│   ├── gpt-j-6b.json
│   ├── gpt2-xl.json
│   ├── gpt4all-j.json
│   ├── internlm2-7b.json
│   ├── internlm2_5-7b-chat.json
│   ├── llama_8BInt8.json
│   ├── llava-1.5-7b-hf.json
│   ├── llava-hf_namespace.json
│   ├── mamba-1.4b-hf.json
│   ├── mamba-2.8b-hf.json
│   ├── meta-llama_namespace.json
│   ├── microsoft_namespace.json
│   ├── mistral.json
│   ├── mistralai_namespace.json
│   ├── models.txt
│   ├── mosaicml_namespace.json
│   ├── mpt-7b-storywriter.json
│   ├── mpt-7b.json
│   ├── namespace1.json
│   ├── nomic-ai_namespace.json
│   ├── ns1_namespace.json
│   ├── oasst-sft-4-pythia-12b-epoch-3.5.json
│   ├── openai-community_namespace.json
│   ├── openbmb_namespace.json
│   ├── opt-iml-max-1.3b.json
│   ├── persimmon-8b-base.json
│   ├── persimmon-8b-chat.json
│   ├── public.json
│   ├── pythia-12b.json
│   ├── reader.json
│   ├── stabilityai_namespace.json
│   ├── stable-diffusion-xl-base-1.0.json
│   ├── stablelm-3b-4e1t.json
│   ├── stablelm-tuned-alpha-7b.json
│   ├── starcoder2-3b.json
│   ├── starcoder2-7b.json
│   ├── state-spaces_namespace.json
│   ├── tenant1.json
│   └── tiiuae_namespace.json
├── dashboard
│   ├── Makefile
│   ├── __pycache__
│   │   ├── na_pb2.cpython-38.pyc
│   │   ├── na_pb2_grpc.cpython-38.pyc
│   │   ├── qobjs_pb2.cpython-38.pyc
│   │   └── qobjs_pb2_grpc.cpython-38.pyc
│   ├── app.py
│   ├── client.py
│   ├── doc
│   ├── gunicorn.conf.py
│   ├── na_pb2.py
│   ├── na_pb2_grpc.py
│   ├── nginx.conf
│   ├── qobjs_pb2.py
│   ├── qobjs_pb2_grpc.py
│   ├── requirements.txt
│   ├── sql
│   │   ├── audit.sql
│   │   ├── create_table.sql
│   │   ├── kv.sql
│   │   └── secret.sql
│   ├── static
│   │   └── button.gif
│   └── templates
│       ├── admin.html
│       ├── base.html
│       ├── func.html
│       ├── func_list.html
│       ├── index.html
│       ├── log.html
│       ├── markdown.html
│       ├── node.html
│       ├── node_list.html
│       ├── pod.html
│       ├── pod_list.html
│       └── snapshot_list.html
├── deployment
│   ├── dashboard.Dockerfile
│   ├── llava.Dockerfile
│   ├── one.Dockerfile
│   ├── spdk.Dockerfile
│   ├── spdk.script
│   ├── spdk2.Dockerfile
│   └── vllm-opai.Dockerfile
├── doc
│   ├── GPUSnapshot.png
│   ├── architect.png
│   ├── comparison.png
│   ├── daemon.json
│   ├── home.md
│   ├── infer_Profile.png
│   ├── keycloak.md
│   ├── logo.png
│   ├── logo1.png
│   └── logo2.png
├── docker-compose.yml
├── docker-compose_blob.yml
├── inferx-realm.json
├── inferxlib
│   ├── Cargo.toml
│   └── src
│       ├── common.rs
│       ├── data_obj.rs
│       ├── lib.rs
│       ├── node.rs
│       ├── obj_mgr
│       │   ├── cidrlock.rs
│       │   ├── func_mgr.rs
│       │   ├── funcsnapshot_mgr.rs
│       │   ├── mod.rs
│       │   ├── namespace_mgr.rs
│       │   ├── node_mgr.rs
│       │   ├── pod_mgr.rs
│       │   └── tenant_mgr.rs
│       ├── resource.rs
│       ├── selector.rs
│       └── validation.rs
├── ixctl
│   ├── command.rs
│   ├── create.rs
│   ├── delete.rs
│   ├── get.rs
│   ├── list.rs
│   ├── main.rs
│   ├── object_client.rs
│   └── update.rs
├── ixctl_logging_config.yaml
├── k8s
│   ├── clean-k3sagent.sh
│   ├── cleanup-k3s.sh
│   ├── dashboard.yaml
│   ├── db-deployment.yaml
│   ├── etcd.yaml
│   ├── inferx_one.yaml
│   ├── inferx_one_blob.yaml
│   ├── ingress.yaml
│   ├── install-k3s.sh
│   ├── join-k3sagent.sh
│   ├── keycloak.yaml
│   ├── keycloak_postgres.yaml
│   ├── nodeagent.yaml
│   ├── nvidia-test.yaml
│   ├── scheduler.yaml
│   ├── secretdb.yaml
│   ├── spdk.yaml
│   └── statesvc.yaml
├── nodeconfig
│   ├── node.json
│   ├── node1.json
│   ├── node2.json
│   ├── node3.json
│   ├── node4.json
│   └── node_blob.json
└── script
    ├── inferx_clean.sh
    ├── run_llava.py
    ├── run_model.py
    └── run_stablediffusion.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "qservice"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
inferxlib = { path = "./inferxlib" }

libc = "0.2.94"
tokio = { version = "1.25", features = ["full"] }
tokio-stream = { version = "0.1", features = ["net"] }
tonic = { version = "0.8" }
hostname = "^0.3"
rand = "0.8.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_derive = "1.0"
regex = "1.7.1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
chrono = "0.4.24"
tower = "0.4.13"
k8s-openapi = { version = "0.18.0", features = ["v1_26"] }
simple-logging = "2.0.2"
log = "0.4.17"
log4rs = "1"
const_format = "0.2.30"
local-ip-address = "0.5.1"
once_cell = "1.17.1"
ipnetwork = "0.20.0"
scopeguard = { version = "^1.1.0", default-features = false }
errno = "0.2.4"
nix = "0.23.1"
futures = "0.3"
dns-lookup = "2.0.4"
#clap = "4.5.9"
clap = "2.33.3"
oauth2 = "4.0"

axum = "0.7.4"
hyper = { version = "1.3.1", features = ["full"] }
hyper-util = { version = "0.1.3", features = ["full"] }
http-body-util = "0.1"
backtrace = "0.3.74"

[dependencies.lazy_static]
version = "1.0"
features = ["spin_no_std"]

[dependencies.uuid]
version = "1.3.1"
features = [
    "v4",                # Lets you generate random UUIDs
    "fast-rng",          # Use a faster (but still sufficiently random) RNG
    "macro-diagnostics", # Enable better diagnostics for compile-time UUIDs
]

[[bin]]
name = "ixctl"
path = "ixctl/main.rs"
--------------------------------------------------------------------------------
/config/Aquila-7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "BAAI",
  "name": "Aquila-7B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "BAAI/Aquila-7B", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 60000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Here is a recipe for vegan banana bread:",
        "path": "v1/completions",
        "body": {"model": "BAAI/Aquila-7B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
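Note: every function config in this directory follows the same access pattern: sample_query describes an OpenAI-style completion request aimed at the function's HTTP endpoint (container port 8000, health probe on /health). The following is a minimal sketch of replaying such a query with stock Python; the base URL is an assumption for illustration, since the configs pin only the container port, not where the service is exposed.

# replay_sample_query.py -- sketch only. BASE_URL is a placeholder; the configs
# fix the container port (8000) but not the externally exposed address.
import json
import urllib.request

BASE_URL = "http://localhost:8000"  # assumed; substitute the real gateway address

def run_sample_query(config_path: str) -> str:
    with open(config_path) as f:
        spec = json.load(f)["object"]["spec"]
    query = spec["sample_query"]
    body = dict(query["body"])
    body["prompt"] = query["prompt"]
    body["max_tokens"] = int(body["max_tokens"])  # the configs store numbers as strings
    body["temperature"] = float(body["temperature"])
    body["stream"] = False                        # keep the sketch non-streaming
    req = urllib.request.Request(
        f"{BASE_URL}/{query['path']}",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["text"]

if __name__ == "__main__":
    print(run_sample_query("config/Aquila-7B.json"))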
--------------------------------------------------------------------------------
/config/BAAI_namespace.json:
--------------------------------------------------------------------------------
{
  "type": "namespace",
  "tenant": "public",
  "namespace": "system",
  "name": "BAAI",
  "object": {
    "spec": {},
    "status": {"disable": false}
  }
}
--------------------------------------------------------------------------------
/config/Baichuan-7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "baichuan-inc",
  "name": "Baichuan-7B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "baichuan-inc/Baichuan-7B", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "1200", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 60000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Give me a short introduction to large language model.",
        "path": "v1/completions",
        "body": {"model": "baichuan-inc/Baichuan-7B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Baichuan2-13B-Chat-4bits.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "baichuan-inc",
  "name": "Baichuan2-13B-Chat-4bits",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "baichuan-inc/Baichuan2-13B-Chat-4bits", "--disable-custom-all-reduce", "--max-model-len", "2000", "--trust-remote-code"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "解释一下'温故而知新'",
        "path": "v1/completions",
        "body": {"model": "baichuan-inc/Baichuan2-13B-Chat-4bits", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/DeciLM-7B-instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Deci",
  "name": "DeciLM-7B-instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Deci/DeciLM-7B-instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Here is a recipe for vegan banana bread:",
        "path": "v1/completions",
        "body": {"model": "Deci/DeciLM-7B-instruct", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/DeciLM-7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Deci",
  "name": "DeciLM-7B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Deci/DeciLM-7B", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "1200", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Here is a recipe for vegan banana bread:",
        "path": "v1/completions",
        "body": {"model": "Deci/DeciLM-7B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Deci_namespace.json:
--------------------------------------------------------------------------------
{
  "type": "namespace",
  "tenant": "public",
  "namespace": "system",
  "name": "Deci",
  "object": {
    "spec": {},
    "status": {"disable": false}
  }
}
--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct copy.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "gemma-7b",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "google/gemma-7b", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
    "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "google/gemma-7b", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "EXAONE-3.0-7.8B-Instruct",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
    "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/EleutherAI_namespace.json:
--------------------------------------------------------------------------------
{
  "type": "namespace",
  "tenant": "public",
  "namespace": "system",
  "name": "EleutherAI",
  "object": {
    "spec": {},
    "status": {"disable": false}
  }
}
--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "meta-llama",
  "name": "Llama-3.2-3B-Instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "200"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 1, "vRam": 14600}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "def print_hello_world():",
        "path": "v1/completions",
        "body": {"model": "meta-llama/Llama-3.2-3B-Instruct", "max_tokens": "120", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct_2gpu.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "meta-llama",
  "name": "Llama-3.2-3B-Instruct_2gpu",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "1000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 14600}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "def print_hello_world():",
        "path": "v1/completions",
        "body": {"model": "meta-llama/Llama-3.2-3B-Instruct", "max_tokens": "120", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Meta-Llama-3-8B-Instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "meta-llama/Meta-Llama-3-8B-Instruct", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
      "probe": {"path": "/health", "port": 8000, "schema": "Http"},
      "api_type": {"openai": {"name": "meta-llama/Meta-Llama-3-8B-Instruct", "max_tokens": 1000, "temperature": 0}},
      "keepalive": "Blob"
    }
  }
}
--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Meta-Llama-3-8B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "meta-llama/Meta-Llama-3-8B", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
      "probe": {"path": "/health", "port": 8000, "schema": "Http"},
      "api_type": {"openai": {"name": "meta-llama/Meta-Llama-3-8B", "max_tokens": 1000, "temperature": 0}},
      "keepalive": "Blob"
    }
  }
}
--------------------------------------------------------------------------------
/config/MiniCPM-2B-dpo-bf16.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "openbmb",
  "name": "MiniCPM-2B-dpo-bf16",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "openbmb/MiniCPM-2B-dpo-bf16", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 28000, "GPU": {"Type": "Any", "Count": 1, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Give me a short introduction to large language model.",
        "path": "v1/completions",
        "body": {"model": "openbmb/MiniCPM-2B-dpo-bf16", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/MiniCPM-2B-sft-bf16.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "openbmb",
  "name": "MiniCPM-2B-sft-bf16",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "openbmb/MiniCPM-2B-sft-bf16", "--trust-remote-code", "--max-model-len", "1200"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 9000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Give me a short introduction to large language model.",
        "path": "v1/completions",
        "body": {"model": "openbmb/MiniCPM-2B-sft-bf16", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/MiniCPM3-4B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "openbmb",
  "name": "MiniCPM3-4B",
  "object": {
    "spec": {
      "image": "vllm-openai-upgraded:v.0.1",
      "commands": ["--model", "openbmb/MiniCPM3-4B", "--enforce-eager", "--trust-remote-code", "--max-model-len", "200"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 9000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "推荐5个北京的景点。",
        "path": "v1/completions",
        "body": {"model": "openbmb/MiniCPM3-4B", "max_tokens": "100", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
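Note: two config layouts coexist in this directory. The older one (e.g. the EXAONE files above and Minitron-8B-Base.json below) keeps a top-level "spec" with "endpoint"/"probe"/"api_type"/"keepalive"; the newer one nests the spec under "object" and uses "sample_query"/"standby" instead. The following is a small normalizer sketch that reads either layout; it uses only field names visible in these files, and nothing beyond them is assumed.

# list_functions.py -- sketch: normalize both config layouts for uniform tooling.
import json
from pathlib import Path

def load_spec(path: Path) -> dict | None:
    cfg = json.loads(path.read_text())
    if cfg.get("type") != "function":
        return None  # skip namespace/tenant objects
    # New layout stores the spec under "object"; old layout keeps it top-level.
    spec = cfg.get("object", {}).get("spec") or cfg.get("spec")
    if not spec:
        return None
    return {
        "fqn": f'{cfg["tenant"]}/{cfg["namespace"]}/{cfg["name"]}',
        "image": spec["image"],
        "model": spec["commands"][spec["commands"].index("--model") + 1],
        "gpu": spec["resources"]["GPU"],
    }

if __name__ == "__main__":
    for p in sorted(Path("config").glob("*.json")):
        info = load_spec(p)
        if info:
            print(info["fqn"], info["model"], info["gpu"])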
--------------------------------------------------------------------------------
/config/Minitron-8B-Base.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Minitron-8B-Base",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "nvidia/Minitron-8B-Base", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
    "envs": [["LD_LIBRARY_PATH", "/Quark/target/debug/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "nvidia/Minitron-8B-Base", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/Mistral-7B-Instruct-v0.1.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Mistral-7B-Instruct-v0.1",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "mistralai/Mistral-7B-Instruct-v0.1", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
      "probe": {"path": "/health", "port": 8000, "schema": "Http"},
      "api_type": {"openai": {"name": "mistralai/Mistral-7B-Instruct-v0.1", "max_tokens": 1000, "temperature": 0}},
      "keepalive": "Blob"
    }
  }
}
--------------------------------------------------------------------------------
/config/Mistral-7B-v0.1.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "mistralai",
  "name": "Mistral-7B-v0.1",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "mistralai/Mistral-7B-v0.1", "--enforce-eager", "--disable-custom-all-reduce", "--gpu-memory-utilization", "0.99", "--max-model-len", "200"],
      "resources": {"CPU": 20000, "Mem": 30000, "GPU": {"Type": "Any", "Count": 1, "vRam": 14800}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "I like traveling by train because",
        "path": "v1/completions",
        "body": {"model": "mistralai/Mistral-7B-v0.1", "max_tokens": "180", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/OLMo-1B-hf.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "allenai",
  "name": "OLMo-1B-hf",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "allenai/OLMo-1B-hf", "--disable-custom-all-reduce", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 1, "vRam": 14600}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "What is the capital of USA?",
        "path": "v1/completions",
        "body": {"model": "allenai/OLMo-1B-hf", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/OLMo-1B-hf_2gpu.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "allenai",
  "name": "OLMo-1B-hf_2gpu",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "allenai/OLMo-1B-hf", "--disable-custom-all-reduce", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 12000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 14600}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "What is the capital of USA?",
        "path": "v1/completions",
        "body": {"model": "allenai/OLMo-1B-hf", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/OLMo-7B-hf.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "allenai",
  "name": "OLMo-7B-hf",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "allenai/OLMo-7B-hf", "--disable-custom-all-reduce", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 70000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "What is the capital of USA?",
        "path": "v1/completions",
        "body": {"model": "allenai/OLMo-7B-hf", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "OLMoE-1B-7B-0924-Instruct",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "allenai/OLMoE-1B-7B-0924-Instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
    "envs": [["LD_LIBRARY_PATH", "/Quark/target/debug/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "allenai/OLMoE-1B-7B-0924-Instruct", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "OLMoE-1B-7B-0924",
  "spec": {
    "image": "vllm-openai-upgraded:v.0.1",
    "commands": ["--model", "allenai/OLMoE-1B-7B-0924", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
    "envs": [["LD_LIBRARY_PATH", "/Quark/target/debug/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "allenai/OLMoE-1B-7B-0924", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/OpenAssistant_namespace.json:
--------------------------------------------------------------------------------
{
  "type": "namespace",
  "tenant": "public",
  "namespace": "system",
  "name": "OpenAssistant",
  "object": {
    "spec": {},
    "status": {"disable": false}
  }
}
--------------------------------------------------------------------------------
/config/Phi-3-mini-128k-instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "microsoft",
  "name": "Phi-3-mini-128k-instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "microsoft/Phi-3-mini-128k-instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 13000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "How to explain Internet for a medieval knight?",
        "path": "v1/completions",
        "body": {"model": "microsoft/Phi-3-mini-128k-instruct", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Phi-3-mini-4k-instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "microsoft",
  "name": "Phi-3-mini-4k-instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "microsoft/Phi-3-mini-4k-instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 13000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
        "path": "v1/completions",
        "body": {"model": "microsoft/Phi-3-mini-4k-instruct", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen-VL-Chat.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Qwen-VL-Chat",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen-VL-Chat", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 6000, "Mem": 70000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
      "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
      "probe": {"path": "/health", "port": 8000, "schema": "Http"},
      "api_type": {"openai": {"name": "Qwen/Qwen-VL-Chat", "max_tokens": 1000, "temperature": 0}},
      "keepalive": "Blob"
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Qwen",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "Qwen/Qwen2.5-3B-Instruct", "--enforce-eager"],
    "resources": {"CPU": 100, "Mem": 200, "GPU": {"Type": "Any", "Usage": {"Partial": 100}}},
    "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "Qwen/Qwen2.5-3B-Instruct", "max_tokens": 200, "temperature": 0}}
  }
}
--------------------------------------------------------------------------------
/config/Qwen1.5-MoE-A2.7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "Qwen1.5-MoE-A2.7B",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "Qwen/Qwen1.5-MoE-A2.7B", "--enforce-eager", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 15000}},
    "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "Qwen/Qwen1.5-MoE-A2.7B", "max_tokens": 1000, "temperature": 0}},
    "keepalive": "Blob"
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-1.5B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-1.5B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-1.5B", "--disable-custom-all-reduce", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 8000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-1.5B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
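Note: each function config declares the full placement demand for its model: CPU, memory, GPU count, and per-GPU vRAM. The sketch below tabulates those demands across config/ so an operator can see which entries fit a given node; reading CPU as millicores and Mem/vRam as MiB is an assumption, since the files do not name their units.

# resource_table.py -- sketch: tally declared resource demand per function config.
import json
from pathlib import Path

rows = []
for p in sorted(Path("config").glob("*.json")):
    cfg = json.loads(p.read_text())
    if cfg.get("type") != "function":
        continue  # namespace/tenant objects carry no resource spec
    spec = cfg.get("object", {}).get("spec") or cfg.get("spec")
    if not spec:
        continue
    gpu = spec["resources"]["GPU"]
    rows.append((p.name, spec["resources"]["CPU"], spec["resources"]["Mem"],
                 gpu.get("Count", 0), gpu.get("vRam", 0)))

print(f'{"config":45} {"CPU":>7} {"Mem":>7} {"GPUs":>4} {"vRAM":>7}')
for name, cpu, mem, count, vram in rows:
    print(f"{name:45} {cpu:>7} {mem:>7} {count:>4} {vram:>7}")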
--------------------------------------------------------------------------------
/config/Qwen2.5-7B-Instruct-GPTQ-Int8.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-7B-Instruct-GPTQ-Int8",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8", "--gpu-memory-utilization", "0.99", "--max-model-len", "500"],
      "resources": {"CPU": 20000, "Mem": 30000, "GPU": {"Type": "Any", "Count": 1, "vRam": 14200}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Give me a short introduction to large language model.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8", "max_tokens": "300", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "t1",
  "namespace": "ns1",
  "name": "models--Qwen--Qwen2.5-7B",
  "spec": {
    "image": "vllm/vllm-openai:v0.7.3",
    "commands": ["--model", "Qwen/Qwen2.5-7B", "--enforce-eager", "--max-model-len", "2000", "--tensor-parallel-size=2"],
    "resources": {"CPU": 6000, "Mem": 80000, "GPU": {"Type": "Any", "Count": 2, "vRam": 14000}},
    "envs": [["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"]],
    "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
    "endpoint": {"path": "/v1/completions", "port": 8000, "schema": "Http"},
    "probe": {"path": "/health", "port": 8000, "schema": "Http"},
    "api_type": {"openai": {"name": "Qwen/Qwen2.5-7B", "max_tokens": 1000, "temperature": 0}}
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-1.5B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Coder-1.5B-Instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Coder-1.5B-Instruct", "--max-model-len", "1000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 6000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "write a quick sort algorithm.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", "max_tokens": "800", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-3B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Coder-3B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Coder-3B", "--max-model-len", "1000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 10000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "write a quick sort algorithm.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Coder-3B", "max_tokens": "800", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-7B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Coder-7B-Instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Coder-7B-Instruct", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "write a quick sort algorithm.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B-Instruct.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Math-1.5B-Instruct",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Math-1.5B-Instruct"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 7000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Math-1.5B-Instruct", "max_tokens": "200", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Math-1.5B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Math-1.5B", "--disable-custom-all-reduce", "--max-model-len", "2000"],
      "resources": {"CPU": 12000, "Mem": 24000, "GPU": {"Type": "Any", "Count": 1, "vRam": 8000}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Math-1.5B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-7B.json:
--------------------------------------------------------------------------------
{
  "type": "function",
  "tenant": "public",
  "namespace": "Qwen",
  "name": "Qwen2.5-Math-7B",
  "object": {
    "spec": {
      "image": "vllm/vllm-openai:v0.7.3",
      "commands": ["--model", "Qwen/Qwen2.5-Math-7B", "--disable-custom-all-reduce", "--trust-remote-code", "--max-model-len", "2000", "--tensor-parallel-size=2"],
      "resources": {"CPU": 20000, "Mem": 50000, "GPU": {"Type": "Any", "Count": 2, "vRam": 13800}},
      "envs": [
        ["LD_LIBRARY_PATH", "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"],
        ["VLLM_CUDART_SO_PATH", "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"]
      ],
      "mounts": [{"hostpath": "/home/brad/cache", "mountpath": "/root/.cache/huggingface"}],
      "endpoint": {"port": 8000, "schema": "Http", "probe": "/health"},
      "sample_query": {
        "apiType": "openai",
        "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
        "path": "v1/completions",
        "body": {"model": "Qwen/Qwen2.5-Math-7B", "max_tokens": "1000", "temperature": "0", "stream": "true"}
      },
      "standby": {"gpu": "Blob", "pageable": "Blob", "pinned": "Blob"}
    }
  }
}
} -------------------------------------------------------------------------------- /config/Qwen7BInt8.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "Qwen", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8", 11 | "--enforce-eager", 12 | "--gpu-memory-utilization=0.99", 13 | "--max-model-len=1000" 14 | ], 15 | "resources": { 16 | "CPU": 100, 17 | "Mem": 200, 18 | "GPU": { 19 | "Type": "RTX3060", 20 | "Count": 1 21 | } 22 | }, 23 | "envs": [ 24 | [ 25 | "LD_LIBRARY_PATH", 26 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 27 | ] 28 | ], 29 | "mounts": [ 30 | { 31 | "hostpath": "/home/brad/cache", 32 | "mountpath": "/root/.cache/huggingface" 33 | } 34 | ], 35 | "endpoint": { 36 | "path": "/v1/completions", 37 | "port": 8000, 38 | "schema": "Http" 39 | }, 40 | "probe": { 41 | "path": "/health", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "api_type": { 46 | "openai": { 47 | "name": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8", 48 | "max_tokens": 200, 49 | "temperature": 0 50 | } 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /config/Qwen_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "Qwen", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/Salesforce_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "Salesforce", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/THUDM_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "THUDM", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/TinyLlama-1.1B-Chat-v1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "TinyLlama", 5 | "name": "TinyLlama-1.1B-Chat-v1.0", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000" 15 | ], 16 | "resources": { 17 | "CPU": 20000, 18 | "Mem": 24000, 19 | "GPU": { 20 | "Type": "Any", 21 | "Count": 1, 22 | "vRam": 4800 23 | } 24 | }, 25 | "envs": [ 26 | [ 27 | "LD_LIBRARY_PATH", 28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 29 | ], 30 | [ 31 | "VLLM_CUDART_SO_PATH", 32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 33 | ] 34 | ], 35 | "mounts": [ 36 | { 37 | "hostpath": "/home/brad/cache", 38 | "mountpath": "/root/.cache/huggingface" 39 | } 40 | ], 41 | "endpoint": { 42 | "port": 8000, 43 | "schema": "Http", 44
| "probe": "/health" 45 | }, 46 | "sample_query": { 47 | "apiType": "openai", 48 | "prompt": "Seattle is a", 49 | "path": "v1/completions", 50 | "body": { 51 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 52 | "max_tokens": "1000", 53 | "temperature": "0", 54 | "stream": "true" 55 | } 56 | }, 57 | "standby": { 58 | "gpu": "File", 59 | "pageable": "File", 60 | "pinned": "File" 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /config/TinyLlama-1.1B-Chat-v1.0_13GB.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "TinyLlama", 5 | "name": "TinyLlama-1.1B-Chat-v1.0_13GB", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000" 15 | ], 16 | "resources": { 17 | "CPU": 20000, 18 | "Mem": 24000, 19 | "GPU": { 20 | "Type": "Any", 21 | "Count": 1, 22 | "vRam": 13800 23 | } 24 | }, 25 | "envs": [ 26 | [ 27 | "LD_LIBRARY_PATH", 28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 29 | ], 30 | [ 31 | "VLLM_CUDART_SO_PATH", 32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 33 | ] 34 | ], 35 | "mounts": [ 36 | { 37 | "hostpath": "/home/brad/cache", 38 | "mountpath": "/root/.cache/huggingface" 39 | } 40 | ], 41 | "endpoint": { 42 | "port": 8000, 43 | "schema": "Http", 44 | "probe": "/health" 45 | }, 46 | "sample_query": { 47 | "apiType": "openai", 48 | "prompt": "Seattle is a", 49 | "path": "v1/completions", 50 | "body": { 51 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 52 | "max_tokens": "1000", 53 | "temperature": "0", 54 | "stream": "true" 55 | } 56 | }, 57 | "standby": { 58 | "gpu": "Blob", 59 | "pageable": "Blob", 60 | "pinned": "Blob" 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /config/TinyLlama-1.1B-Chat-v1.0_2gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "TinyLlama", 5 | "name": "TinyLlama-1.1B-Chat-v1.0_2gpu", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000", 15 | "--tensor-parallel-size=2" 16 | ], 17 | "resources": { 18 | "CPU": 20000, 19 | "Mem": 50000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 2, 23 | "vRam": 13800 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "port": 8000, 40 | "schema": "Http", 41 | "probe": "/health" 42 | }, 43 | "sample_query": { 44 | "apiType": "openai", 45 | "prompt": "Seattle is a", 46 | "path": "v1/completions", 47 | "body": { 48 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 49 | "max_tokens": "1000", 50 | "temperature": "0", 51 | "stream": "true" 52 | } 53 | }, 54 | "standby": { 55 | "gpu": "Blob", 56 | "pageable": "Blob", 57 | "pinned": "Blob" 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- 
/config/TinyLlama-1.1B-Chat-v1.0_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "TinyLlama-1.1B-Chat-v1.0_test", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.4.2", 9 | "commands": [ 10 | "--model", 11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000" 15 | ], 16 | "resources": { 17 | "CPU": 20000, 18 | "Mem": 18000, 19 | "GPU": { 20 | "Type": "Any", 21 | "Count": 1, 22 | "vRam": 4500 23 | } 24 | }, 25 | "envs": [ 26 | [ 27 | "LD_LIBRARY_PATH", 28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 29 | ] 30 | ], 31 | "mounts": [ 32 | { 33 | "hostpath": "/home/brad/cache", 34 | "mountpath": "/root/.cache/huggingface" 35 | } 36 | ], 37 | "endpoint": { 38 | "port": 8000, 39 | "schema": "Http", 40 | "probe": "/health" 41 | }, 42 | "sample_query": { 43 | "apiType": "openai", 44 | "prompt": "Seattle is a", 45 | "path": "v1/completions", 46 | "body": { 47 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 48 | "max_tokens": "1000", 49 | "temperature": "0", 50 | "stream": "true" 51 | } 52 | }, 53 | "standby": { 54 | "gpu": "File", 55 | "pageable": "File", 56 | "pinned": "File" 57 | } 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/TinyLlama_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "TinyLlama", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/XVERSE-13B-Chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "XVERSE-13B-Chat", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "xverse/XVERSE-13B-Chat", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "xverse/XVERSE-13B-Chat", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/XVERSE-7B-Chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "XVERSE-7B-Chat", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | 
"xverse/XVERSE-7B-Chat", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "xverse/XVERSE-7B-Chat", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/allenai_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "allenai", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/baichuan-inc_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "baichuan-inc", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/bigcode_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "bigcode", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/chatglm3-6b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "THUDM", 5 | "name": "chatglm3-6b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "THUDM/chatglm3-6b", 12 | "--enforce-eager", 13 | "--max-model-len", 14 | "1500", 15 | "--gpu-memory-utilization", 16 | "0.99", 17 | "--trust-remote-code" 18 | ], 19 | "resources": { 20 | "CPU": 12000, 21 | "Mem": 24000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 1, 25 | "vRam": 13800 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ], 33 | [ 34 | "VLLM_CUDART_SO_PATH", 35 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 36 | ] 37 | ], 38 | "mounts": [ 39 | { 40 | "hostpath": "/home/brad/cache", 41 | "mountpath": "/root/.cache/huggingface" 42 | } 43 | ], 44 | "endpoint": { 45 | "port": 8000, 46 | "schema": "Http", 47 | "probe": "/health" 48 | }, 49 | "sample_query": { 50 | "apiType": "openai", 51 | "prompt": "Give me a short introduction to large language model.", 52 | "path": "v1/completions", 53 | "body": { 54 | "model": "THUDM/chatglm3-6b", 55 | "max_tokens": 
"200", 56 | "temperature": "0", 57 | "stream": "true" 58 | } 59 | }, 60 | "standby": { 61 | "gpu": "Blob", 62 | "pageable": "Blob", 63 | "pinned": "Blob" 64 | } 65 | } 66 | } 67 | } -------------------------------------------------------------------------------- /config/codegen-2B-multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "Salesforce", 5 | "name": "codegen-2B-multi", 6 | "object": { 7 | "spec": { 8 | "image": "vllm-openai-upgraded:v0.1.0", 9 | "entrypoint": [ 10 | "/usr/bin/python3" 11 | ], 12 | "commands": [ 13 | "/usr/lib/run_model.py", 14 | "Salesforce/codegen-2B-multi", 15 | "200" 16 | ], 17 | "resources": { 18 | "CPU": 20000, 19 | "Mem": 12000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 1, 23 | "vRam": 13000 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "port": 8000, 40 | "schema": "Http", 41 | "probe": "/health" 42 | }, 43 | "sample_query": { 44 | "apiType": "standard", 45 | "prompt": "def hello_world():", 46 | "path": "v1/completions", 47 | "body": { 48 | "model": "N/A", 49 | "max_tokens": "200", 50 | "temperature": "0" 51 | } 52 | }, 53 | "standby": { 54 | "gpu": "Blob", 55 | "pageable": "Blob", 56 | "pinned": "Blob" 57 | } 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/core42_jais-13b-bnb-4bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "core42_jais-13b-bnb-4bit", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "jwnder/core42_jais-13b-bnb-4bit", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000" 17 | ], 18 | "resources": { 19 | "CPU": 6000, 20 | "Mem": 20000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 1, 24 | "vRam": 15000 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ] 32 | ], 33 | "mounts": [ 34 | { 35 | "hostpath": "/home/brad/cache", 36 | "mountpath": "/root/.cache/huggingface" 37 | } 38 | ], 39 | "endpoint": { 40 | "path": "/v1/completions", 41 | "port": 8000, 42 | "schema": "Http" 43 | }, 44 | "probe": { 45 | "path": "/health", 46 | "port": 8000, 47 | "schema": "Http" 48 | }, 49 | "api_type": { 50 | "openai": { 51 | "name": "jwnder/core42_jais-13b-bnb-4bit", 52 | "max_tokens": 1000, 53 | "temperature": 0 54 | } 55 | }, 56 | "keepalive": "Blob" 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /config/core42_jais-13b-chat-bnb-4bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "core42_jais-13b-chat-bnb-4bit", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "jwnder/core42_jais-13b-chat-bnb-4bit", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000" 17 | ], 18 | 
"resources": { 19 | "CPU": 6000, 20 | "Mem": 20000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 1, 24 | "vRam": 15000 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ] 32 | ], 33 | "mounts": [ 34 | { 35 | "hostpath": "/home/brad/cache", 36 | "mountpath": "/root/.cache/huggingface" 37 | } 38 | ], 39 | "endpoint": { 40 | "path": "/v1/completions", 41 | "port": 8000, 42 | "schema": "Http" 43 | }, 44 | "probe": { 45 | "path": "/health", 46 | "port": 8000, 47 | "schema": "Http" 48 | }, 49 | "api_type": { 50 | "openai": { 51 | "name": "jwnder/core42_jais-13b-chat-bnb-4bit", 52 | "max_tokens": 1000, 53 | "temperature": 0 54 | } 55 | }, 56 | "keepalive": "Blob" 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /config/databricks_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "databricks", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/deepseek-ai_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "deepseek-ai", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/deepseek-math-7b-rl.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "deepseek-ai", 5 | "name": "deepseek-math-7b-rl", 6 | "object": { 7 | "spec": { 8 | "image": "vllm-openai-upgraded:v.0.1", 9 | "commands": [ 10 | "--model", 11 | "deepseek-ai/deepseek-math-7b-rl", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 20000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 13000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "port": 8000, 42 | "schema": "Http", 43 | "probe": "/health" 44 | }, 45 | "sample_query": { 46 | "apiType": "openai", 47 | "prompt": "what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.", 48 | "path": "v1/completions", 49 | "body": { 50 | "model": "deepseek-ai/deepseek-math-7b-rl", 51 | "max_tokens": "1000", 52 | "temperature": "0", 53 | "stream": "true" 54 | } 55 | }, 56 | "standby": { 57 | "gpu": "Blob", 58 | "pageable": "Blob", 59 | "pinned": "Blob" 60 | } 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /config/deepseek-vl2-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "deepseek-ai", 5 | "name": "deepseek-vl2-tiny", 6 | "object": { 7 | "spec": { 8 | "image": 
"vllm-openai-upgraded:v.0.1", 9 | "commands": [ 10 | "--model", 11 | "deepseek-ai/deepseek-vl2-tiny", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 20000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 13000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "port": 8000, 42 | "schema": "Http", 43 | "probe": "/health" 44 | }, 45 | "sample_query": { 46 | "apiType": "openai", 47 | "prompt": "What is the capital of USA?", 48 | "path": "v1/completions", 49 | "body": { 50 | "model": "deepseek-ai/deepseek-vl2-tiny", 51 | "max_tokens": "1000", 52 | "temperature": "0", 53 | "stream": "true" 54 | } 55 | }, 56 | "standby": { 57 | "gpu": "Blob", 58 | "pageable": "Blob", 59 | "pinned": "Blob" 60 | } 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /config/facebook_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "facebook", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/falcon-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "tiiuae", 5 | "name": "falcon-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "tiiuae/falcon-7b", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "tiiuae/falcon-7b", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/falcon-rw-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "tiiuae", 5 | "name": "falcon-rw-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "tiiuae/falcon-rw-7b", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000", 15 | "--tensor-parallel-size=2" 16 | ], 17 | "resources": { 18 | "CPU": 12000, 19 | "Mem": 80000, 20 | "GPU": { 21 | "Type": "Any", 22 
| "Count": 2, 23 | "vRam": 13800 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ], 31 | [ 32 | "VLLM_CUDART_SO_PATH", 33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 34 | ] 35 | ], 36 | "mounts": [ 37 | { 38 | "hostpath": "/home/brad/cache", 39 | "mountpath": "/root/.cache/huggingface" 40 | } 41 | ], 42 | "endpoint": { 43 | "port": 8000, 44 | "schema": "Http", 45 | "probe": "/health" 46 | }, 47 | "sample_query": { 48 | "apiType": "openai", 49 | "prompt": "Here is a recipe for vegan banana bread:", 50 | "path": "v1/completions", 51 | "body": { 52 | "model": "tiiuae/falcon-rw-7b", 53 | "max_tokens": "1000", 54 | "temperature": "0", 55 | "stream": "true" 56 | } 57 | }, 58 | "standby": { 59 | "gpu": "Blob", 60 | "pageable": "Blob", 61 | "pinned": "Blob" 62 | } 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /config/gemma-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "gemma-7b", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "google/gemma-7b", 11 | "--enforce-eager", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "2000", 16 | "--tensor-parallel-size=2" 17 | ], 18 | "resources": { 19 | "CPU": 6000, 20 | "Mem": 50000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 2, 24 | "vRam": 15000 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ] 32 | ], 33 | "mounts": [ 34 | { 35 | "hostpath": "/home/brad/cache", 36 | "mountpath": "/root/.cache/huggingface" 37 | } 38 | ], 39 | "endpoint": { 40 | "path": "/v1/completions", 41 | "port": 8000, 42 | "schema": "Http" 43 | }, 44 | "probe": { 45 | "path": "/health", 46 | "port": 8000, 47 | "schema": "Http" 48 | }, 49 | "api_type": { 50 | "openai": { 51 | "name": "google/gemma-7b", 52 | "max_tokens": 1000, 53 | "temperature": 0 54 | } 55 | }, 56 | "keepalive": "Blob" 57 | } 58 | } -------------------------------------------------------------------------------- /config/gpt-j-6b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "gpt-j-6b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "EleutherAI/gpt-j-6b", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 70000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "EleutherAI/gpt-j-6b", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | 
}, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/gpt2-xl.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "openai-community", 5 | "name": "gpt2-xl", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "openai-community/gpt2-xl", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "800" 15 | ], 16 | "resources": { 17 | "CPU": 12000, 18 | "Mem": 24000, 19 | "GPU": { 20 | "Type": "Any", 21 | "Count": 1, 22 | "vRam": 12000 23 | } 24 | }, 25 | "envs": [ 26 | [ 27 | "LD_LIBRARY_PATH", 28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 29 | ], 30 | [ 31 | "VLLM_CUDART_SO_PATH", 32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 33 | ] 34 | ], 35 | "mounts": [ 36 | { 37 | "hostpath": "/home/brad/cache", 38 | "mountpath": "/root/.cache/huggingface" 39 | } 40 | ], 41 | "endpoint": { 42 | "port": 8000, 43 | "schema": "Http", 44 | "probe": "/health" 45 | }, 46 | "sample_query": { 47 | "apiType": "openai", 48 | "prompt": "Here is a recipe for vegan banana bread:", 49 | "path": "v1/completions", 50 | "body": { 51 | "model": "openai-community/gpt2-xl", 52 | "max_tokens": "600", 53 | "temperature": "0", 54 | "stream": "true" 55 | } 56 | }, 57 | "standby": { 58 | "gpu": "Blob", 59 | "pageable": "Blob", 60 | "pinned": "Blob" 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /config/gpt4all-j.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "nomic-ai", 5 | "name": "gpt4all-j", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "nomic-ai/gpt4all-j", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "2000", 16 | "--tensor-parallel-size=2" 17 | ], 18 | "resources": { 19 | "CPU": 20000, 20 | "Mem": 60000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 2, 24 | "vRam": 13800 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ], 32 | [ 33 | "VLLM_CUDART_SO_PATH", 34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 35 | ] 36 | ], 37 | "mounts": [ 38 | { 39 | "hostpath": "/home/brad/cache", 40 | "mountpath": "/root/.cache/huggingface" 41 | } 42 | ], 43 | "endpoint": { 44 | "port": 8000, 45 | "schema": "Http", 46 | "probe": "/health" 47 | }, 48 | "sample_query": { 49 | "apiType": "openai", 50 | "prompt": "Here is a recipe for vegan banana bread:", 51 | "path": "v1/completions", 52 | "body": { 53 | "model": "nomic-ai/gpt4all-j", 54 | "max_tokens": "1000", 55 | "temperature": "0", 56 | "stream": "true" 57 | } 58 | }, 59 | "standby": { 60 | "gpu": "Blob", 61 | "pageable": "Blob", 62 | "pinned": "Blob" 63 | } 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /config/internlm2-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "internlm2-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | 
"internlm/internlm2-7b", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "internlm/internlm2-7b", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/internlm2_5-7b-chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "internlm2_5-7b-chat", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "internlm/internlm2_5-7b-chat", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "internlm/internlm2_5-7b-chat", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/llama_8BInt8.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "TinyLlama-1.1B-Chat-v1.0_2gpu", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "meta-llama/Llama-Guard-3-8B-INT8", 11 | "--enforce-eager", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000", 15 | "--tensor-parallel-size=2" 16 | ], 17 | "resources": { 18 | "CPU": 6000, 19 | "Mem": 50000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 2, 23 | "vRam": 13800 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/Quark/target/debug/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "path": "/v1/completions", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "probe": { 44 | "path": "/health", 45 | "port": 8000, 46 | "schema": "Http" 47 | }, 48 | "api_type": { 49 | "openai": { 50 | "name": 
"meta-llama/Llama-Guard-3-8B-INT8", 51 | "max_tokens": 1000, 52 | "temperature": 0 53 | } 54 | }, 55 | "keepalive": "Blob" 56 | } 57 | } -------------------------------------------------------------------------------- /config/llava-1.5-7b-hf.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "llava-hf", 5 | "name": "llava-1.5-7b-hf", 6 | "object": { 7 | "spec": { 8 | "image": "vllm-openai-upgraded:v0.1.0", 9 | "entrypoint": [ 10 | "/usr/bin/python3" 11 | ], 12 | "commands": [ 13 | "/usr/lib/run_llava.py" 14 | ], 15 | "resources": { 16 | "CPU": 20000, 17 | "Mem": 12000, 18 | "GPU": { 19 | "Type": "Any", 20 | "Count": 1, 21 | "vRam": 14000 22 | } 23 | }, 24 | "envs": [ 25 | [ 26 | "LD_LIBRARY_PATH", 27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 28 | ] 29 | ], 30 | "mounts": [ 31 | { 32 | "hostpath": "/home/brad/cache", 33 | "mountpath": "/root/.cache/huggingface" 34 | } 35 | ], 36 | "endpoint": { 37 | "port": 8000, 38 | "schema": "Http", 39 | "probe": "/health" 40 | }, 41 | "sample_query": { 42 | "apiType": "llava", 43 | "prompt": "What is shown in this image?", 44 | "path": "v1/completions", 45 | "body": { 46 | "image": "https://www.ilankelman.org/stopsigns/australia.jpg" 47 | } 48 | }, 49 | "standby": { 50 | "gpu": "Blob", 51 | "pageable": "Blob", 52 | "pinned": "Blob" 53 | } 54 | } 55 | } 56 | } -------------------------------------------------------------------------------- /config/llava-hf_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "llava-hf", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/mamba-1.4b-hf.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "state-spaces", 5 | "name": "mamba-1.4b-hf", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "state-spaces/mamba-1.4b-hf", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--max-model-len", 15 | "2000" 16 | ], 17 | "resources": { 18 | "CPU": 12000, 19 | "Mem": 50000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 1, 23 | "vRam": 13800 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "port": 8000, 40 | "schema": "Http", 41 | "probe": "/health" 42 | }, 43 | "sample_query": { 44 | "apiType": "openai", 45 | "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI", 46 | "path": "v1/completions", 47 | "body": { 48 | "model": "state-spaces/mamba-1.4b-hf", 49 | "max_tokens": "1000", 50 | "temperature": "0", 51 | "stream": "true" 52 | } 53 | }, 54 | "standby": { 55 | "gpu": "Blob", 56 | "pageable": "Blob", 57 | "pinned": "Blob" 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /config/mamba-2.8b-hf.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": 
"public", 4 | "namespace": "state-spaces", 5 | "name": "mamba-2.8b-hf", 6 | "object": { 7 | "spec": { 8 | "image": "vllm-openai-upgraded:v.0.1", 9 | "commands": [ 10 | "--model", 11 | "state-spaces/mamba-2.8b-hf", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--max-model-len", 15 | "2000" 16 | ], 17 | "resources": { 18 | "CPU": 12000, 19 | "Mem": 50000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 1, 23 | "vRam": 13800 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "port": 8000, 40 | "schema": "Http", 41 | "probe": "/health" 42 | }, 43 | "sample_query": { 44 | "apiType": "openai", 45 | "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI", 46 | "path": "v1/completions", 47 | "body": { 48 | "model": "state-spaces/mamba-2.8b-hf", 49 | "max_tokens": "1000", 50 | "temperature": "0", 51 | "stream": "true" 52 | } 53 | }, 54 | "standby": { 55 | "gpu": "Blob", 56 | "pageable": "Blob", 57 | "pinned": "Blob" 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /config/meta-llama_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "meta-llama", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/microsoft_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "microsoft", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/mistral.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "mistral", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "mistralai/Mistral-7B-v0.1", 11 | "--enforce-eager" 12 | ], 13 | "resources": { 14 | "CPU": 100, 15 | "Mem": 200, 16 | "GPU": { 17 | "Type": "RTX3060", 18 | "Count": 2 19 | } 20 | }, 21 | "envs": [ 22 | [ 23 | "LD_LIBRARY_PATH", 24 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 25 | ] 26 | ], 27 | "mounts": [ 28 | { 29 | "hostpath": "/home/brad/cache", 30 | "mountpath": "/root/.cache/huggingface" 31 | } 32 | ], 33 | "endpoint": { 34 | "path": "/v1/completions", 35 | "port": 8000, 36 | "schema": "Http" 37 | }, 38 | "probe": { 39 | "path": "/health", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "api_type": { 44 | "openai": { 45 | "name": "mistralai/Mistral-7B-v0.1", 46 | "max_tokens": 200, 47 | "temperature": 0 48 | } 49 | } 50 | } 51 | } -------------------------------------------------------------------------------- /config/mistralai_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "mistralai", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": 
false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/mosaicml_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "mosaicml", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/mpt-7b-storywriter.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "mosaicml", 5 | "name": "mpt-7b-storywriter", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "mosaicml/mpt-7b-storywriter", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "1000", 16 | "--tensor-parallel-size=2" 17 | ], 18 | "resources": { 19 | "CPU": 20000, 20 | "Mem": 50000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 2, 24 | "vRam": 13800 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ], 32 | [ 33 | "VLLM_CUDART_SO_PATH", 34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 35 | ] 36 | ], 37 | "mounts": [ 38 | { 39 | "hostpath": "/home/brad/cache", 40 | "mountpath": "/root/.cache/huggingface" 41 | } 42 | ], 43 | "endpoint": { 44 | "port": 8000, 45 | "schema": "Http", 46 | "probe": "/health" 47 | }, 48 | "sample_query": { 49 | "apiType": "openai", 50 | "prompt": "Here is a recipe for vegan banana bread:", 51 | "path": "v1/completions", 52 | "body": { 53 | "model": "mosaicml/mpt-7b-storywriter", 54 | "max_tokens": "800", 55 | "temperature": "0", 56 | "stream": "true" 57 | } 58 | }, 59 | "standby": { 60 | "gpu": "Blob", 61 | "pageable": "Blob", 62 | "pinned": "Blob" 63 | } 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /config/mpt-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "mosaicml", 5 | "name": "mpt-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "mosaicml/mpt-7b", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "1000", 16 | "--tensor-parallel-size=2" 17 | ], 18 | "resources": { 19 | "CPU": 20000, 20 | "Mem": 50000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 2, 24 | "vRam": 13800 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ], 32 | [ 33 | "VLLM_CUDART_SO_PATH", 34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 35 | ] 36 | ], 37 | "mounts": [ 38 | { 39 | "hostpath": "/home/brad/cache", 40 | "mountpath": "/root/.cache/huggingface" 41 | } 42 | ], 43 | "endpoint": { 44 | "port": 8000, 45 | "schema": "Http", 46 | "probe": "/health" 47 | }, 48 | "sample_query": { 49 | "apiType": "openai", 50 | "prompt": "Here is a recipe for vegan banana bread:", 51 | "path": "v1/completions", 52 | "body": { 53 | "model": "mosaicml/mpt-7b", 54 | "max_tokens": "800", 55 | "temperature": "0", 56 | "stream": "true" 57 | } 58 | }, 59 | "standby": { 60 | "gpu": "Blob", 61 | "pageable": "Blob", 62 | "pinned": 
"Blob" 63 | } 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /config/namespace1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "t1", 4 | "namespace": "system", 5 | "name": "ns1", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/nomic-ai_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "nomic-ai", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/ns1_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "t1", 4 | "namespace": "system", 5 | "name": "ns1", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/openai-community_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "openai-community", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/openbmb_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "openbmb", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/opt-iml-max-1.3b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "facebook", 5 | "name": "opt-iml-max-1.3b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "facebook/opt-iml-max-1.3b", 12 | "--max-model-len", 13 | "200" 14 | ], 15 | "resources": { 16 | "CPU": 12000, 17 | "Mem": 24000, 18 | "GPU": { 19 | "Type": "Any", 20 | "Count": 1, 21 | "vRam": 4500 22 | } 23 | }, 24 | "envs": [ 25 | [ 26 | "LD_LIBRARY_PATH", 27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 28 | ], 29 | [ 30 | "VLLM_CUDART_SO_PATH", 31 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "port": 8000, 42 | "schema": "Http", 43 | "probe": "/health" 44 | }, 45 | "sample_query": { 46 | "apiType": "openai", 47 | "prompt": "What is the capital of USA?", 48 | "path": "v1/completions", 49 | "body": { 50 | "model": "facebook/opt-iml-max-1.3b", 51 | "max_tokens": "100", 52 | "temperature": "0", 53 | "stream": "true" 54 | } 55 | }, 56 | "standby": { 57 | "gpu": "Mem", 58 | "pageable": "File", 59 | "pinned": "Mem" 60 | } 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- 
/config/persimmon-8b-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "persimmon-8b-base", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "adept/persimmon-8b-base", 11 | "--enforce-eager", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "2000" 16 | ], 17 | "resources": { 18 | "CPU": 6000, 19 | "Mem": 18000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 2, 23 | "vRam": 15000 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "path": "/v1/completions", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "probe": { 44 | "path": "/health", 45 | "port": 8000, 46 | "schema": "Http" 47 | }, 48 | "api_type": { 49 | "openai": { 50 | "name": "adept/persimmon-8b-base", 51 | "max_tokens": 1000, 52 | "temperature": 0 53 | } 54 | }, 55 | "keepalive": "Blob" 56 | } 57 | } -------------------------------------------------------------------------------- /config/persimmon-8b-chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "persimmon-8b-chat", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "adept/persimmon-8b-chat", 11 | "--enforce-eager", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "2000" 16 | ], 17 | "resources": { 18 | "CPU": 6000, 19 | "Mem": 18000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 2, 23 | "vRam": 15000 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "path": "/v1/completions", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "probe": { 44 | "path": "/health", 45 | "port": 8000, 46 | "schema": "Http" 47 | }, 48 | "api_type": { 49 | "openai": { 50 | "name": "adept/persimmon-8b-chat", 51 | "max_tokens": 1000, 52 | "temperature": 0 53 | } 54 | }, 55 | "keepalive": "Blob" 56 | } 57 | } -------------------------------------------------------------------------------- /config/public.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "tenant", 3 | "tenant": "system", 4 | "namespace": "system", 5 | "name": "public", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/reader.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "reader-lm", 6 | "spec": { 7 | "image": "vllm/vllm-openai:v0.7.3", 8 | "commands": [ 9 | "--model", 10 | "jinaai/reader-lm-1.5b", 11 | "--enforce-eager" 12 | ], 13 | "resources": { 14 | "CPU": 100, 15 | "Mem": 200, 16 | "GPU": { 17 | "Type": "RTX3060", 18 | "Count": 1 19 | } 20 | }, 21 | "envs": [ 22 | [ 23 | "LD_LIBRARY_PATH", 24 | 
"/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 25 | ] 26 | ], 27 | "mounts": [ 28 | { 29 | "hostpath": "/home/brad/cache", 30 | "mountpath": "/root/.cache/huggingface" 31 | } 32 | ], 33 | "endpoint": { 34 | "path": "/v1/completions", 35 | "port": 8000, 36 | "schema": "Http" 37 | }, 38 | "probe": { 39 | "path": "/health", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "api_type": { 44 | "openai": { 45 | "name": "jinaai/reader-lm-1.5b", 46 | "max_tokens": 200, 47 | "temperature": 0 48 | } 49 | } 50 | } 51 | } -------------------------------------------------------------------------------- /config/stabilityai_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "stabilityai", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/stable-diffusion-xl-base-1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "stabilityai", 5 | "name": "stable-diffusion-xl-base-1.0", 6 | "object": { 7 | "spec": { 8 | "image": "vllm-openai-upgraded:v0.1.0", 9 | "entrypoint": [ 10 | "/usr/bin/python3" 11 | ], 12 | "commands": [ 13 | "/usr/lib/run_stablediffusion.py" 14 | ], 15 | "resources": { 16 | "CPU": 20000, 17 | "Mem": 50000, 18 | "GPU": { 19 | "Type": "Any", 20 | "Count": 1, 21 | "vRam": 13800 22 | } 23 | }, 24 | "envs": [ 25 | [ 26 | "height", 27 | "512" 28 | ], 29 | [ 30 | "width", 31 | "512" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "port": 8000, 42 | "schema": "Http", 43 | "probe": "/health" 44 | }, 45 | "sample_query": { 46 | "apiType": "text2img", 47 | "prompt": "An astronaut riding a green horse", 48 | "path": "funccall", 49 | "body": {} 50 | }, 51 | "standby": { 52 | "gpu": "Blob", 53 | "pageable": "Blob", 54 | "pinned": "Blob" 55 | } 56 | } 57 | } 58 | } -------------------------------------------------------------------------------- /config/stablelm-3b-4e1t.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "stablelm-3b-4e1t", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "stabilityai/stablelm-3b-4e1t", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--max-model-len", 15 | "2000" 16 | ], 17 | "resources": { 18 | "CPU": 6000, 19 | "Mem": 18000, 20 | "GPU": { 21 | "Type": "Any", 22 | "Count": 1, 23 | "vRam": 8000 24 | } 25 | }, 26 | "envs": [ 27 | [ 28 | "LD_LIBRARY_PATH", 29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 30 | ] 31 | ], 32 | "mounts": [ 33 | { 34 | "hostpath": "/home/brad/cache", 35 | "mountpath": "/root/.cache/huggingface" 36 | } 37 | ], 38 | "endpoint": { 39 | "path": "/v1/completions", 40 | "port": 8000, 41 | "schema": "Http" 42 | }, 43 | "probe": { 44 | "path": "/health", 45 | "port": 8000, 46 | "schema": "Http" 47 | }, 48 | "api_type": { 49 | "openai": { 50 | "name": "stabilityai/stablelm-3b-4e1t", 51 | "max_tokens": 1000, 52 | "temperature": 0 53 | } 54 | }, 55 | "keepalive": "Blob" 56 | } 57 | } 58 | } 
-------------------------------------------------------------------------------- /config/stablelm-tuned-alpha-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "t1", 4 | "namespace": "ns1", 5 | "name": "stablelm-tuned-alpha-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "stabilityai/stablelm-tuned-alpha-7b", 12 | "--enforce-eager", 13 | "--disable-custom-all-reduce", 14 | "--trust-remote-code", 15 | "--max-model-len", 16 | "2000", 17 | "--tensor-parallel-size=2" 18 | ], 19 | "resources": { 20 | "CPU": 6000, 21 | "Mem": 50000, 22 | "GPU": { 23 | "Type": "Any", 24 | "Count": 2, 25 | "vRam": 15000 26 | } 27 | }, 28 | "envs": [ 29 | [ 30 | "LD_LIBRARY_PATH", 31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 32 | ] 33 | ], 34 | "mounts": [ 35 | { 36 | "hostpath": "/home/brad/cache", 37 | "mountpath": "/root/.cache/huggingface" 38 | } 39 | ], 40 | "endpoint": { 41 | "path": "/v1/completions", 42 | "port": 8000, 43 | "schema": "Http" 44 | }, 45 | "probe": { 46 | "path": "/health", 47 | "port": 8000, 48 | "schema": "Http" 49 | }, 50 | "api_type": { 51 | "openai": { 52 | "name": "stabilityai/stablelm-tuned-alpha-7b", 53 | "max_tokens": 1000, 54 | "temperature": 0 55 | } 56 | }, 57 | "keepalive": "Blob" 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /config/starcoder2-3b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "bigcode", 5 | "name": "starcoder2-3b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "bigcode/starcoder2-3b", 12 | "--disable-custom-all-reduce", 13 | "--max-model-len", 14 | "2000" 15 | ], 16 | "resources": { 17 | "CPU": 12000, 18 | "Mem": 50000, 19 | "GPU": { 20 | "Type": "Any", 21 | "Count": 1, 22 | "vRam": 13800 23 | } 24 | }, 25 | "envs": [ 26 | [ 27 | "LD_LIBRARY_PATH", 28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 29 | ], 30 | [ 31 | "VLLM_CUDART_SO_PATH", 32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 33 | ] 34 | ], 35 | "mounts": [ 36 | { 37 | "hostpath": "/home/brad/cache", 38 | "mountpath": "/root/.cache/huggingface" 39 | } 40 | ], 41 | "endpoint": { 42 | "port": 8000, 43 | "schema": "Http", 44 | "probe": "/health" 45 | }, 46 | "sample_query": { 47 | "apiType": "openai", 48 | "prompt": "def print_hello_world():", 49 | "path": "v1/completions", 50 | "body": { 51 | "model": "bigcode/starcoder2-3b", 52 | "max_tokens": "1000", 53 | "temperature": "0", 54 | "stream": "true" 55 | } 56 | }, 57 | "standby": { 58 | "gpu": "Blob", 59 | "pageable": "Blob", 60 | "pinned": "Blob" 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /config/starcoder2-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "function", 3 | "tenant": "public", 4 | "namespace": "bigcode", 5 | "name": "starcoder2-7b", 6 | "object": { 7 | "spec": { 8 | "image": "vllm/vllm-openai:v0.7.3", 9 | "commands": [ 10 | "--model", 11 | "bigcode/starcoder2-7b", 12 | "--disable-custom-all-reduce", 13 | "--trust-remote-code", 14 | "--max-model-len", 15 | "2000", 16 | "--tensor-parallel-size=2" 17 | ], 18 | "resources": { 19 | "CPU": 20000, 20 | 
"Mem": 50000, 21 | "GPU": { 22 | "Type": "Any", 23 | "Count": 2, 24 | "vRam": 13800 25 | } 26 | }, 27 | "envs": [ 28 | [ 29 | "LD_LIBRARY_PATH", 30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH" 31 | ], 32 | [ 33 | "VLLM_CUDART_SO_PATH", 34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12" 35 | ] 36 | ], 37 | "mounts": [ 38 | { 39 | "hostpath": "/home/brad/cache", 40 | "mountpath": "/root/.cache/huggingface" 41 | } 42 | ], 43 | "endpoint": { 44 | "port": 8000, 45 | "schema": "Http", 46 | "probe": "/health" 47 | }, 48 | "sample_query": { 49 | "apiType": "openai", 50 | "prompt": "def print_hello_world():", 51 | "path": "v1/completions", 52 | "body": { 53 | "model": "bigcode/starcoder2-7b", 54 | "max_tokens": "1000", 55 | "temperature": "0", 56 | "stream": "true" 57 | } 58 | }, 59 | "standby": { 60 | "gpu": "Blob", 61 | "pageable": "Blob", 62 | "pinned": "Blob" 63 | } 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /config/state-spaces_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "state-spaces", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/tenant1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "tenant", 3 | "tenant": "system", 4 | "namespace": "system", 5 | "name": "t1", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /config/tiiuae_namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "namespace", 3 | "tenant": "public", 4 | "namespace": "system", 5 | "name": "tiiuae", 6 | "object": { 7 | "spec": {}, 8 | "status": { 9 | "disable": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /dashboard/Makefile: -------------------------------------------------------------------------------- 1 | # pip install grpcio grpcio-tools 2 | # pip install psycopg2-binary 3 | all: protoc 4 | run: 5 | KEYCLOAK_URL=http://192.168.0.22:1260/authn python3 ./app.py 6 | 7 | protoc: 8 | python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. qobjs.proto 9 | python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. 
na.proto 10 | -------------------------------------------------------------------------------- /dashboard/__pycache__/na_pb2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2.cpython-38.pyc -------------------------------------------------------------------------------- /dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc -------------------------------------------------------------------------------- /dashboard/__pycache__/qobjs_pb2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2.cpython-38.pyc -------------------------------------------------------------------------------- /dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc -------------------------------------------------------------------------------- /dashboard/doc: -------------------------------------------------------------------------------- 1 | ../doc -------------------------------------------------------------------------------- /dashboard/gunicorn.conf.py: -------------------------------------------------------------------------------- 1 | bind = "0.0.0.0:1250" 2 | workers = 4 3 | worker_class = "gevent" 4 | timeout = 30 -------------------------------------------------------------------------------- /dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==23.1.0 2 | async-generator==1.10 3 | blinker==1.6.2 4 | exceptiongroup==1.1.1 5 | Flask==2.3.2 6 | grpcio==1.54.2 7 | grpcio-tools==1.54.2 8 | h2==3.2.0 9 | hpack==3.0.0 10 | html5tagger==1.3.0 11 | httptools==0.5.0 12 | hyperframe==5.2.0 13 | itsdangerous==2.1.2 14 | janus==1.0.0 15 | Jinja2==3.1.2 16 | MarkupSafe==2.1.3 17 | multidict==6.0.4 18 | numpy==1.24.3 19 | protobuf==4.23.2 20 | purerpc==0.8.0 21 | sanic==23.3.0 22 | sanic-routing==22.8.0 23 | sniffio==1.3.0 24 | tracerite==1.1.0 25 | typing_extensions==4.6.3 26 | ujson==5.7.0 27 | uvloop==0.17.0 28 | websockets==11.0.3 29 | Werkzeug==2.3.6 30 | flask-cors==4.0.1 31 | requests==2.25.1 32 | markdown==3.7 33 | gunicorn==23.0.0 34 | gevent==24.2.1 35 | Authlib==1.3.2 36 | -------------------------------------------------------------------------------- /dashboard/sql/kv.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE auditdb; 2 | CREATE DATABASE auditdb; 3 | 4 | \c auditdb; 5 | 6 | DROP TABLE Pod; 7 | CREATE TABLE Pod ( 8 | tenant VARCHAR NOT NULL, 9 | namespace VARCHAR NOT NULL, 10 | fpname VARCHAR NOT NULL, 11 | fprevision bigint, 12 | id VARCHAR NOT NULL, 13 | nodename VARCHAR NOT NULL, 14 | state VARCHAR NOT NULL, 15 | updatetime TIMESTAMP, 16 | PRIMARY KEY(tenant, namespace, fpname, fprevision, id) 17 | ); 18 | 19 | DROP TABLE PodAudit; 20 | CREATE TABLE PodAudit ( 21 | tenant VARCHAR NOT NULL, 22 | namespace VARCHAR 
NOT NULL, 23 | fpname VARCHAR NOT NULL, 24 | fprevision bigint, 25 | id VARCHAR NOT NULL, 26 | nodename VARCHAR NOT NULL, 27 | action VARCHAR NOT NULL, 28 | state VARCHAR NOT NULL, 29 | updatetime TIMESTAMP, 30 | PRIMARY KEY(tenant, namespace, fpname, fprevision, id, updatetime) 31 | ); 32 | 33 | DROP TABLE ReqAudit; 34 | CREATE TABLE ReqAudit ( 35 | seqid SERIAL PRIMARY KEY, 36 | podkey VARCHAR NOT NULL, 37 | audittime TIMESTAMP, 38 | keepalive bool, 39 | ttft int, -- Time to First Token 40 | latency int 41 | ); 42 | 43 | CREATE USER audit_user WITH PASSWORD '123456'; 44 | GRANT ALL ON ALL TABLES IN SCHEMA public to audit_user; 45 | GRANT USAGE ON SEQUENCE reqaudit_seqid_seq TO audit_user; 46 | 47 | -- https://stackoverflow.com/questions/18664074/getting-error-peer-authentication-failed-for-user-postgres-when-trying-to-ge 48 | 49 | DROP DATABASE testdb; 50 | CREATE DATABASE testdb; 51 | 52 | \c testdb; 53 | 54 | DROP TABLE Pod; 55 | CREATE TABLE Pod ( 56 | tenant VARCHAR NOT NULL 57 | ); 58 | 59 | insert into pod values ('asdf'); 60 | 61 | CREATE OR REPLACE FUNCTION notification_trigger() RETURNS TRIGGER AS 62 | $$ 63 | BEGIN 64 | PERFORM pg_notify('your_channel_name', 65 | to_json(NEW)::TEXT 66 | ); 67 | RETURN NEW; 68 | END; 69 | $$ LANGUAGE plpgsql; 70 | 71 | CREATE OR REPLACE TRIGGER capture_change_trigger AFTER INSERT OR UPDATE OR DELETE ON pod 72 | FOR EACH ROW EXECUTE FUNCTION notification_trigger(); 73 | 74 |
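The notification_trigger/capture_change_trigger pair above turns every INSERT/UPDATE/DELETE on pod into a pg_notify message on your_channel_name, with the changed row serialized as JSON. A minimal sketch of a listener consuming that channel follows; the DSN is an assumption pieced together from the credentials above, so match it to your deployment:

import select
import psycopg2

# DSN assumed from the audit_user/testdb definitions above; adjust as needed.
conn = psycopg2.connect("postgresql://audit_user:123456@localhost:5432/testdb")
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

cur = conn.cursor()
cur.execute("LISTEN your_channel_name;")  # channel named in notification_trigger()

while True:
    # Wait until the connection's socket is readable, then drain notifications.
    if select.select([conn], [], [], 60) == ([], [], []):
        continue  # timed out; keep waiting
    conn.poll()
    while conn.notifies:
        note = conn.notifies.pop(0)
        print("change on pod:", note.payload)  # the to_json(NEW) text from the trigger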

-------------------------------------------------------------------------------- /dashboard/sql/secret.sql: -------------------------------------------------------------------------------- 1 | --DROP TABLE ApiKey; 2 | CREATE TABLE Apikey ( 3 | apikey VARCHAR NOT NULL, 4 | username VARCHAR NOT NULL, 5 | keyname VARCHAR NOT NULL, 6 | createtime TIMESTAMP, 7 | PRIMARY KEY(apikey) 8 | ); 9 | 10 | CREATE UNIQUE INDEX apikey_idx_realm_username ON Apikey (username, keyname); 11 | 12 | CREATE TABLE UserRole ( 13 | username VARCHAR NOT NULL, 14 | rolename VARCHAR NOT NULL, 15 | PRIMARY KEY(username, rolename) 16 | ); 17 | 18 | CREATE INDEX userrole_idx_rolename ON UserRole (rolename); 19 | -------------------------------------------------------------------------------- /dashboard/static/button.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/static/button.gif -------------------------------------------------------------------------------- /dashboard/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 | [lines 5-14: HTML markup lost in extraction; visible text: heading "func audit"] 15 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/log.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |

[lines 5-22: HTML markup lost in extraction; visible text: heading "func log"; table header: namespace / func name / func id; table row: {{ namespace }} / {{ funcName }} / {{ funcId }}] 23 | 24 | {{ log | safe }} 25 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/markdown.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 | [lines 5-17: HTML markup lost in extraction; table cell renders {{ md_content|safe }}] 18 | 19 | 20 | 21 | 22 | 23 | {{ log | safe }} 24 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/node.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |

[lines 5-24: HTML markup lost in extraction; visible text: heading "Node"; table header: Node Name / Node; table row (inside {% autoescape false %} ... {% endautoescape %}): {{ name }} / {{ node }}] 25 | 26 | {{ log | safe }} 27 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/node_list.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |

[lines 5-38: HTML markup lost in extraction; visible text: heading "Nodes"; table header: nodename / Ip Address / CIDR / CPU Count / CPU Memory (GB) / MaxContextPerGPU / BlobStore / GPUs; per-node row ({% for node in nodes %}, inside {% autoescape false %}): {{ node["name"] }} / {{ node['object']['nodeIp'] }} / {{ node['object']['cidr'] }} / {{ node['object']['resources']['CPU'] // 1000 }} / {{ node['object']['resources']['Mem'] // 1000 }} / {{ node['object']['resources']['MaxContextPerGPU'] }} / {{ node['object']['blobStoreEnable'] }} / {{ node['object']['resources']['GPUs'] }}] 39 | {{ hosturl }} 40 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/pod.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 | 5 |

[lines 6-25: HTML markup lost in extraction; visible text: heading "funcpod"; table header: tenant / namespace / podname; table row: {{ tenant }} / {{ namespace }} / {{ podname }}] 26 | 27 | {% if audits %} 28 | [lines 28-40: HTML markup lost in extraction; heading "state"; table header: state / time; per-audit row ({% for audit in audits %}): {{ audit["state"] }} / {{ audit["updatetime"] }}] 41 | {% endif %} 42 | [lines 42-49: HTML markup lost in extraction; heading "log"; table cell (inside {% autoescape false %}): {{ log }}] 50 | 51 | [lines 52-58 lost in extraction] 59 | {{ hosturl }} 60 | {% endblock %} -------------------------------------------------------------------------------- /dashboard/templates/snapshot_list.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 | [lines 5-11 lost in extraction]

[lines 12-36: HTML markup lost in extraction; visible text: heading "Snapshots"; table header: Snapshot Id / nodename / state / gpu / pageable / pinned / docker image name / build id; per-snapshot row ({% for snapshot in snapshots %}): {{ snapshot["name"] }} / {{ snapshot["object"]['nodename'] }} / {{ snapshot["object"]['state'] }} / {{ snapshot["object"]['info']['gpuMemSizes'] }} / {{ snapshot["object"]['info']['processCheckpointSize'] // (1024*1024) }} MB / {{ snapshot["object"]['info']['hostMemSize'] // (1024*1024) }} MB / {{ snapshot["object"]['meta']['imagename'] }} / {{ snapshot["object"]['meta']['buildId'] }}]
37 | {{ hosturl }} 38 | {% endblock %} -------------------------------------------------------------------------------- /deployment/dashboard.Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.10-slim-buster 4 | 5 | WORKDIR / 6 | 7 | RUN apt-get -y update 8 | RUN apt-get install -y libpq-dev gcc 9 | RUN apt-get install -y bash 10 | RUN apt-get install -y nginx 11 | RUN apt-get install -y curl 12 | 13 | COPY requirements.txt requirements.txt 14 | RUN pip3 install -r requirements.txt 15 | 16 | COPY . . 17 | 18 | COPY nginx.conf /etc/nginx/sites-available/default 19 | 20 | CMD service nginx start && gunicorn -w 4 -b 0.0.0.0:1250 app:app 21 | # CMD service nginx start && python3 ./app.py 22 | # CMD python3 ./app.py -------------------------------------------------------------------------------- /deployment/llava.Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 4 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 5 | WORKDIR / 6 | RUN apt-get -y update 7 | RUN apt-get -y install libnuma-dev fuse3 libkeyutils-dev libaio-dev 8 | 9 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml 10 | COPY node.json /opt/inferx/config/node.json 11 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so 12 | COPY . . 13 | CMD ["./onenode"] -------------------------------------------------------------------------------- /deployment/one.Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 4 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 5 | WORKDIR / 6 | RUN apt-get -y update 7 | RUN apt-get -y install libnuma-dev 8 | RUN apt-get -y install fuse3 9 | RUN apt-get -y install libkeyutils-dev 10 | RUN apt-get -y install libaio-dev 11 | # RUN apt-get -y install libssl3 12 | RUN apt-get -y install libssl-dev 13 | 14 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml 15 | COPY node.json /opt/inferx/config/node.json 16 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so 17 | COPY . . 
18 | CMD ["./onenode"] -------------------------------------------------------------------------------- /deployment/spdk.Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu as the base image 2 | FROM ubuntu:22.04 3 | 4 | # Set environment variables 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | build-essential \ 10 | git \ 11 | gcc \ 12 | make \ 13 | libaio-dev \ 14 | libpciaccess-dev \ 15 | python3 \ 16 | python3-pip \ 17 | pciutils \ 18 | pkg-config kmod \ 19 | libjson-c-dev libcunit1-dev libssl-dev libcmocka-dev uuid-dev libiscsi-dev libkeyutils-dev libncurses5-dev libncursesw5-dev unzip libfuse3-dev patchelf \ 20 | python3-configshell-fb python3-pexpect nasm libnuma-dev \ 21 | autoconf automake libtool help2man systemtap-sdt-dev \ 22 | astyle lcov clang sg3-utils shellcheck abigail-tools bash-completion ruby-dev pycodestyle bundler rake python3-paramiko curl \ 23 | libpmem-dev libpmemblk-dev libpmemobj-dev \ 24 | librados-dev librbd-dev libibverbs-dev librdmacm-dev 25 | 26 | # Clone the SPDK repository 27 | RUN git clone https://github.com/spdk/spdk.git /spdk --recursive 28 | 29 | # Set working directory 30 | WORKDIR /spdk 31 | 32 | RUN ./scripts/pkgdep.sh --all 33 | RUN ./configure 34 | RUN make 35 | 36 | # Set up entrypoint to provide SPDK CLI tools 37 | ENTRYPOINT scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json && scripts/setup.sh 38 | -------------------------------------------------------------------------------- /deployment/spdk.script: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # exit immediately if any command fails 3 | 4 | echo "Generating NVMe config..." 5 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json 6 | sleep 1 7 | echo "Running SPDK setup..." 8 | scripts/setup.sh 9 | sleep 1 10 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json 11 | sleep 1 12 | scripts/setup.sh 13 | echo "SPDK setup complete." 14 | 15 | while true; do sleep 86400; done 16 | -------------------------------------------------------------------------------- /deployment/spdk2.Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu as the base image 2 | FROM inferx/spdk-container:v0.1.0 3 | 4 | # Set environment variables 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | COPY entrypoint.sh /spdk/entrypoint.sh 8 | 9 | # Set working directory 10 | WORKDIR /spdk 11 | 12 | 13 | # Set up entrypoint to provide SPDK CLI tools 14 | ENTRYPOINT bash /spdk/entrypoint.sh 15 | -------------------------------------------------------------------------------- /deployment/vllm-opai.Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t vllm-openai-upgraded . 
2 | FROM vllm/vllm-openai:v0.7.3 3 | WORKDIR / 4 | # Upgrade the transformers library 5 | RUN apt-get -y update 6 | RUN apt-get install libglib2.0-0 -y 7 | RUN apt-get install libgl1 -y 8 | 9 | RUN pip install --upgrade transformers 10 | RUN pip install --upgrade safetensors 11 | RUN pip install diffusers --upgrade 12 | RUN pip install invisible_watermark accelerate 13 | 14 | COPY run_model.py /usr/lib/run_model.py 15 | COPY run_llava.py /usr/lib/run_llava.py 16 | COPY run_stablediffusion.py /usr/lib/run_stablediffusion.py 17 | 18 | -------------------------------------------------------------------------------- /doc/GPUSnapshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/GPUSnapshot.png -------------------------------------------------------------------------------- /doc/architect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/architect.png -------------------------------------------------------------------------------- /doc/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/comparison.png -------------------------------------------------------------------------------- /doc/daemon.json: -------------------------------------------------------------------------------- 1 | { 2 | "runtimes": { 3 | "nvidia": { 4 | "args": [], 5 | "path": "nvidia-container-runtime" 6 | }, 7 | "inferx": { 8 | "path": "/opt/inferx/bin/inferx" 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /doc/home.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /doc/infer_Profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/infer_Profile.png -------------------------------------------------------------------------------- /doc/keycloak.md: -------------------------------------------------------------------------------- 1 | 1. Create Realm "Inferx" 2 | 2. Create Client "infer_client" in Realm "Inferx" 3 | a. Enable Client authentication 4 | b. Add Valid redirect URI 5 | https://inferx.net:8000/* 6 | http://:1250/* 7 | http://:81/* 8 | http://:4000/* 9 | c. Add web origins 10 | https://inferx.net:8000 11 | http://:1250 12 | http://:81 13 | http://:4000 14 | d. Enable "Direct Access Grants Enabled" 15 | 3. Update KEYCLOAK_CLIENT_SECRET in docker-compose_blob.yml 16 | 4. 
Update KEYCLOAK_URL with the local Keycloak address 17 | 18 | 19 | curl -X POST "http://192.168.0.22:1260/authn/realms/inferx/protocol/openid-connect/token" \ 20 | -H "Content-Type: application/x-www-form-urlencoded" \ 21 | -d "client_id=infer_client" \ 22 | -d "client_secret=M2Dse5531tdtyipZdGizLEeoOVgziQRX" \ 23 | -d "username=testuser1" \ 24 | -d "password=test" \ 25 | -d "grant_type=password" 26 | 27 |
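The same password-grant exchange as the curl command above, sketched in Python together with how the returned token is used. The dashboard URL is illustrative, and the client secret must come from your own realm (the value above is a sample):

import requests

KEYCLOAK = "http://192.168.0.22:1260/authn"  # the KEYCLOAK_URL from the steps above

token_resp = requests.post(
    f"{KEYCLOAK}/realms/inferx/protocol/openid-connect/token",
    data={
        "client_id": "infer_client",
        "client_secret": "<your-client-secret>",
        "username": "testuser1",
        "password": "test",
        "grant_type": "password",
    },
)
token_resp.raise_for_status()
access_token = token_resp.json()["access_token"]

# Present the token as a Bearer header on subsequent requests (URL illustrative).
resp = requests.get(
    "http://192.168.0.22:1250/",
    headers={"Authorization": f"Bearer {access_token}"},
)
print(resp.status_code)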
-------------------------------------------------------------------------------- /doc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo.png -------------------------------------------------------------------------------- /doc/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo1.png -------------------------------------------------------------------------------- /doc/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo2.png -------------------------------------------------------------------------------- /inferxlib/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "inferxlib" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | serde = { version = "1.0", features = ["derive"] } 10 | serde_json = "1.0" 11 | regex = "1.7.1" 12 | bollard = "=0.17.0" 13 | reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } 14 | log = "0.4.17" 15 | log4rs = "1" 16 | 17 | [dependencies.lazy_static] 18 | version = "1.0" 19 | features = ["spin_no_std"] 20 | -------------------------------------------------------------------------------- /inferxlib/src/common.rs: -------------------------------------------------------------------------------- 1 | use serde_json::Error as SerdeJsonError; 2 | 3 | pub type Result<T> = core::result::Result<T, Error>; 4 | 5 | #[derive(Debug)] 6 | pub enum Error { 7 | CommonError(String), 8 | NotExist(String), 9 | Exist(String), 10 | SchedulerNoEnoughResource(String), 11 | SerdeJsonError(SerdeJsonError), 12 | StdIOErr(std::io::Error), 13 | ReqWestErr(reqwest::Error), 14 | } 15 | 16 | impl From<SerdeJsonError> for Error { 17 | fn from(item: SerdeJsonError) -> Self { 18 | return Self::SerdeJsonError(item); 19 | } 20 | } 21 | 22 | impl From<std::io::Error> for Error { 23 | fn from(item: std::io::Error) -> Self { 24 | return Self::StdIOErr(item); 25 | } 26 | } 27 | 28 | impl From<reqwest::Error> for Error { 29 | fn from(item: reqwest::Error) -> Self { 30 | return Self::ReqWestErr(item); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /inferxlib/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | #![allow(non_snake_case)] 3 | #![allow(non_upper_case_globals)] 4 | #![allow(non_camel_case_types)] 5 | #![allow(deprecated)] 6 | #![allow(unused_imports)] 7 | 8 | #[macro_use] 9 | extern crate log; 10 | 11 | pub mod common; 12 | pub mod data_obj; 13 | pub mod node; 14 | pub mod obj_mgr; 15 | pub mod resource; 16 | pub mod selector; 17 | pub mod validation; 18 | -------------------------------------------------------------------------------- /inferxlib/src/obj_mgr/cidrlock.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use crate::resource::NodeResources; 4 | 5 | use crate::data_obj::*; 6 | 7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)] 8 | pub struct CidrlockSpec {} 9 | -------------------------------------------------------------------------------- /inferxlib/src/obj_mgr/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod cidrlock; 16 | pub mod func_mgr; 17 | pub mod funcsnapshot_mgr; 18 | pub mod namespace_mgr; 19 | pub mod node_mgr; 20 | pub mod pod_mgr; 21 | pub mod tenant_mgr; 22 | -------------------------------------------------------------------------------- /inferxlib/src/obj_mgr/namespace_mgr.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | use serde::{Deserialize, Serialize}; 16 | 17 | use crate::data_obj::*; 18 | 19 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 20 | pub struct NamespaceObject { 21 | pub spec: NamespaceSpec, 22 | pub status: NamespaceStatus, 23 | } 24 | 25 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 26 | pub struct NamespaceStatus { 27 | pub disable: bool, 28 | } 29 | 30 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 31 | pub struct NamespaceSpec {} 32 | 33 | pub type Namespace = DataObject<NamespaceObject>; 34 | pub type NamespaceMgr = DataObjectMgr<NamespaceObject>; 35 | 36 | impl Namespace { 37 | pub const KEY: &'static str = "namespace"; 38 | } 39 | -------------------------------------------------------------------------------- /inferxlib/src/obj_mgr/node_mgr.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use crate::resource::NodeResources; 4 | 5 | use crate::data_obj::*; 6 | 7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)] 8 | pub struct NodeSpec { 9 | pub nodeIp: String, 10 | pub podMgrPort: u16, 11 | pub tsotSvcPort: u16, 12 | pub stateSvcPort: u16, 13 | pub cidr: String, 14 | pub resources: NodeResources, 15 | pub blobStoreEnable: bool, 16 | } 17 | 18 | pub type Node = DataObject<NodeSpec>; 19 | pub type NodeMgr = DataObjectMgr<NodeSpec>; 20 | 21 | impl Node { 22 | pub const KEY: &'static str = "node_info"; 23 | pub const TENANT: &'static str = "system"; 24 | pub const NAMESPACE: &'static str = "system"; 25 | 26 | pub fn QletUrl(&self) -> String { 27 | return format!("http://{}:{}", self.object.nodeIp, self.object.podMgrPort); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /inferxlib/src/obj_mgr/tenant_mgr.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | use serde::{Deserialize, Serialize}; 16 | 17 | use crate::data_obj::*; 18 | 19 | pub const SYSTEM_TENANT: &str = "system"; 20 | pub const SYSTEM_NAMESPACE: &str = "system"; 21 | 22 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 23 | pub struct TenantObject { 24 | pub spec: TenantSpec, 25 | pub status: TenantStatus, 26 | } 27 | 28 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 29 | pub struct TenantStatus { 30 | pub disable: bool, 31 | } 32 | 33 | #[derive(Serialize, Deserialize, Debug, Default, Clone)] 34 | pub struct TenantSpec {} 35 | 36 | pub type Tenant = DataObject<TenantObject>; 37 | 38 | impl Tenant { 39 | pub const KEY: &'static str = "tenant"; 40 | } 41 | 42 | pub type TenantMgr = DataObjectMgr<TenantObject>; 43 | -------------------------------------------------------------------------------- /ixctl_logging_config.yaml: -------------------------------------------------------------------------------- 1 | appenders: 2 | my_stdout: 3 | kind: console 4 | encoder: 5 | pattern: "{h({d(%Y-%m-%d %H:%M:%S)(utc)} - {l}: {m}{n})}" 6 | my_file_logger: 7 | kind: rolling_file 8 | path: "/opt/inferx/log/ixctl.log" 9 | encoder: 10 | pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}" 11 | policy: 12 | trigger: 13 | kind: size 14 | limit: 50mb 15 | roller: 16 | kind: delete 17 | append_logger: 18 | kind: file 19 | path: "/opt/inferx/log/ixctl.log" 20 | append: true 21 | encoder: 22 | pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}" 23 | root: 24 | level: info 25 | appenders: 26 | - append_logger 27 | -------------------------------------------------------------------------------- /k8s/clean-k3sagent.sh: -------------------------------------------------------------------------------- 1 | # Stop K3s service if running 2 | sudo systemctl stop k3s-agent || true 3 | 4 | # Run the uninstall script if present 5 | sudo /usr/local/bin/k3s-agent-uninstall.sh || true 6 | 7 | # Clean residual data 8 | sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet /etc/systemd/system/k3s-agent.service /usr/local/bin/k3s* 9 | 10 | # Optionally clean containerd data if used before 11 | # sudo rm -rf /var/lib/containerd 12 | 13 | echo "K3s agent cleanup complete." -------------------------------------------------------------------------------- /k8s/cleanup-k3s.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | ### 1. Stop & Uninstall K3s 5 | if command -v k3s-uninstall.sh &> /dev/null; then 6 | sudo /usr/local/bin/k3s-uninstall.sh # Stops k3s, removes services, data, etc. 7 | fi 8 | if command -v k3s-agent-uninstall.sh &> /dev/null; then 9 | sudo /usr/local/bin/k3s-agent-uninstall.sh # Removes agent components on workers 10 | fi 11 | 12 | ### 2. Kill any remaining processes 13 | if command -v k3s-killall.sh &> /dev/null; then 14 | sudo /usr/local/bin/k3s-killall.sh # Kills k3s-related processes, containerd, etc. 15 | fi 16 | 17 | ### 3. Remove leftover dirs and configs 18 | # sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet 19 | # /etc/containerd /var/lib/containerd # Clean containerd and K3s state 20 | 21 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/systemd/system/k3s* /var/lib/containerd /etc/cni /opt/cni 22 | 23 | 24 | ### 4.
Restart containerd to clear any stuck state 25 | sudo systemctl restart containerd # Ensures containerd is fresh 26 | 27 | echo "✔️ K3s and related components have been fully removed." 28 | -------------------------------------------------------------------------------- /k8s/dashboard.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: inferx-dashboard 5 | labels: 6 | app: inferx-dashboard 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: inferx-dashboard 12 | template: 13 | metadata: 14 | labels: 15 | app: inferx-dashboard 16 | spec: 17 | containers: 18 | - name: inferx-dashboard 19 | image: inferx/inferx_dashboard:v0.1.1 20 | imagePullPolicy: IfNotPresent 21 | env: 22 | - name: KEYCLOAK_URL 23 | value: "http://192.168.0.22:31260/authn" 24 | - name: KEYCLOAK_REALM_NAME 25 | value: "inferx" 26 | - name: KEYCLOAK_CLIENT_ID 27 | value: "infer_client" 28 | - name: KEYCLOAK_CLIENT_SECRET 29 | value: "M2Dse5531tdtyipZdGizLEeoOVgziQRX" 30 | - name: INFERX_APIGW_ADDR 31 | value: "http://nodeagent:4000" 32 | volumeMounts: 33 | - name: cert-volume 34 | mountPath: /etc/letsencrypt/ 35 | livenessProbe: 36 | httpGet: 37 | path: /intro?name=home.md 38 | port: 1250 39 | initialDelaySeconds: 10 40 | periodSeconds: 10 41 | volumes: 42 | - name: cert-volume 43 | hostPath: 44 | path: /etc/letsencrypt/ 45 | --- 46 | apiVersion: v1 47 | kind: Service 48 | metadata: 49 | name: inferx-dashboard 50 | spec: 51 | type: NodePort 52 | selector: 53 | app: inferx-dashboard 54 | ports: 55 | - name: http 56 | port: 1250 57 | targetPort: 1250 58 | nodePort: 31250 -------------------------------------------------------------------------------- /k8s/db-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: db-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: 1Gi 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: db 16 | spec: 17 | replicas: 1 18 | selector: 19 | matchLabels: 20 | app: db 21 | template: 22 | metadata: 23 | labels: 24 | app: db 25 | spec: 26 | nodeSelector: 27 | inferx_storage: data 28 | containers: 29 | - name: postgres 30 | image: postgres:14.5 31 | imagePullPolicy: IfNotPresent 32 | env: 33 | - name: POSTGRES_USER 34 | value: audit_user 35 | - name: POSTGRES_PASSWORD 36 | value: "123456" 37 | - name: POSTGRES_DB 38 | value: auditdb 39 | - name: PGDATA 40 | value: /data/postgres 41 | volumeMounts: 42 | - name: db-data 43 | mountPath: /data/postgres 44 | - name: init-sql 45 | mountPath: /docker-entrypoint-initdb.d/db.sql 46 | volumes: 47 | - name: db-data 48 | hostPath: 49 | path: /opt/inferx/data/postgres 50 | type: DirectoryOrCreate 51 | - name: init-sql 52 | hostPath: 53 | path: /opt/inferx/config/create_table.sql 54 | type: File 55 | --- 56 | apiVersion: v1 57 | kind: Service 58 | metadata: 59 | name: db 60 | spec: 61 | selector: 62 | app: db 63 | ports: 64 | - port: 5432 65 | targetPort: 5432 66 | nodePort: 30542 67 | type: NodePort 68 | -------------------------------------------------------------------------------- /k8s/etcd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: etcd 5 | labels: 6 | app: etcd 7 | spec: 8 | replicas: 1 9 | selector: 10 |
matchLabels: 11 | app: etcd 12 | template: 13 | metadata: 14 | labels: 15 | app: etcd 16 | spec: 17 | nodeSelector: 18 | inferx_storage: data 19 | containers: 20 | - name: etcd 21 | image: quay.io/coreos/etcd:v3.5.13 22 | imagePullPolicy: IfNotPresent 23 | volumeMounts: 24 | - name: etcd-data 25 | mountPath: /opt/inferx/data/etcd 26 | command: [ "etcd" ] 27 | args: 28 | - "--name=etcd-00" 29 | - "--data-dir=/opt/inferx/data/etcd" 30 | - "--advertise-client-urls=http://etcd-00:2379" 31 | - "--listen-client-urls=http://0.0.0.0:2379" 32 | - "--initial-advertise-peer-urls=http://etcd-00:2380" 33 | - "--listen-peer-urls=http://0.0.0.0:2380" 34 | - "--initial-cluster=etcd-00=http://etcd-00:2380" 35 | volumes: 36 | - name: etcd-data 37 | hostPath: 38 | path: /opt/inferx/data/etcd 39 | type: DirectoryOrCreate 40 | --- 41 | apiVersion: v1 42 | kind: Service 43 | metadata: 44 | name: etcd 45 | spec: 46 | selector: 47 | app: etcd 48 | ports: 49 | - port: 2379 50 | targetPort: 2379 51 | -------------------------------------------------------------------------------- /k8s/ingress.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: Ingress 3 | metadata: 4 | name: inferx-ingress 5 | annotations: 6 | nginx.ingress.kubernetes.io/use-regex: "true" 7 | nginx.ingress.kubernetes.io/rewrite-target: /$1$2 8 | nginx.ingress.kubernetes.io/proxy-buffering: "off" 9 | nginx.ingress.kubernetes.io/proxy-request-buffering: "off" 10 | nginx.ingress.kubernetes.io/proxy-http-version: "1.1" 11 | nginx.ingress.kubernetes.io/proxy-chunked: "on" 12 | spec: 13 | rules: 14 | - http: 15 | paths: 16 | - path: /funccall/ 17 | pathType: Prefix 18 | backend: 19 | service: 20 | name: nodeagent 21 | port: 22 | number: 4000 23 | - path: /authn/ 24 | pathType: Prefix 25 | backend: 26 | service: 27 | name: keycloak 28 | port: 29 | number: 8080 30 | - path: / 31 | pathType: Prefix 32 | backend: 33 | service: 34 | name: inferx-dashboard 35 | port: 36 | number: 1250 37 | ports: 38 | web: 39 | port: 80 40 | hostPort: 80 41 | expose: true 42 | websecure: 43 | port: 443 44 | hostPort: 443 45 | expose: true -------------------------------------------------------------------------------- /k8s/install-k3s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | ### 2. Install K3s using Docker runtime 6 | echo "[+] Installing K3s with Docker as container runtime..." 7 | curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--docker --node-external-ip=192.168.0.22" sh - 8 | 9 | echo "[+] Waiting for K3s to be ready..." 10 | sleep 10 11 | kubectl get node 12 | 13 | ### 3. Install Helm (if not installed) 14 | if ! command -v helm &> /dev/null; then 15 | echo "[+] Installing Helm..." 16 | curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash 17 | fi 18 | 19 | ### 4. Add NVIDIA Helm repo 20 | echo "[+] Adding NVIDIA Helm repo..." 21 | helm repo add nvidia https://nvidia.github.io/gpu-operator 22 | helm repo update 23 | 24 | ### 5. Deploy NVIDIA GPU Operator with Docker runtime 25 | echo "[+] Installing NVIDIA GPU Operator..." 
26 | export KUBECONFIG=/etc/rancher/k3s/k3s.yaml 27 | chmod 555 /etc/rancher/k3s/k3s.yaml 28 | helm install --wait gpu-operator \ 29 | nvidia/gpu-operator \ 30 | -n gpu-operator --create-namespace \ 31 | --set operator.defaultRuntime=docker \ 32 | --set driver.enabled=false \ 33 | --set toolkit.enabled=true 34 | 35 | echo "[✓] K3s with Docker runtime and NVIDIA GPU Operator installed successfully." 36 | -------------------------------------------------------------------------------- /k8s/join-k3sagent.sh: -------------------------------------------------------------------------------- 1 | # On server node 2 | # sudo cat /var/lib/rancher/k3s/server/node-token 3 | # hostname -I # Use internal IP accessible by the joining node 4 | 5 | 6 | sudo /usr/local/bin/k3s-agent-uninstall.sh 7 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /var/lib/cni /run/flannel 8 | 9 | 10 | 11 | curl -sfL https://get.k3s.io | K3S_URL=https://192.168.0.22:6443 \ 12 | K3S_TOKEN=K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \ 13 | INSTALL_K3S_EXEC="--docker --with-node-id" sh - 14 | 15 | sudo k3s agent --docker \ 16 | --server https://192.168.0.22:6443 \ 17 | --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \ 18 | --with-node-id \ 19 | --node-name inferx-agent1 \ 20 | --debug 21 | 22 | 23 | # sudo k3s agent --docker --server https://192.168.0.22:6443 --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 --debug 24 | 25 | -------------------------------------------------------------------------------- /k8s/keycloak.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: keycloak 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: keycloak 10 | template: 11 | metadata: 12 | labels: 13 | app: keycloak 14 | spec: 15 | containers: 16 | - name: keycloak 17 | image: quay.io/keycloak/keycloak:latest 18 | imagePullPolicy: IfNotPresent 19 | args: ["start-dev", "--verbose"] 20 | env: 21 | - name: KEYCLOAK_ADMIN 22 | value: admin 23 | - name: KEYCLOAK_ADMIN_PASSWORD 24 | value: admin 25 | - name: KC_DB 26 | value: postgres 27 | - name: KC_DB_URL 28 | value: jdbc:postgresql://keycloak-postgres:5432/keycloak 29 | - name: KC_DB_USERNAME 30 | value: keycloak 31 | - name: KC_DB_PASSWORD 32 | value: "123456" 33 | - name: KC_HTTP_ENABLED 34 | value: "true" 35 | - name: KC_PROXY 36 | value: edge 37 | - name: KC_HOSTNAME_STRICT_HTTPS 38 | value: "false" 39 | - name: KC_HOSTNAME_STRICT 40 | value: "false" 41 | - name: KC_HTTP_RELATIVE_PATH 42 | value: /authn 43 | ports: 44 | - containerPort: 8080 45 | --- 46 | apiVersion: v1 47 | kind: Service 48 | metadata: 49 | name: keycloak 50 | spec: 51 | type: NodePort 52 | selector: 53 | app: keycloak 54 | ports: 55 | - port: 8080 56 | targetPort: 8080 57 | nodePort: 31260 # Can customize between 30000–32767 58 | -------------------------------------------------------------------------------- /k8s/keycloak_postgres.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: keycloak-db-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: 1Gi 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: keycloak-postgres 16 |
spec: 17 | replicas: 1 18 | selector: 19 | matchLabels: 20 | app: keycloak-postgres 21 | template: 22 | metadata: 23 | labels: 24 | app: keycloak-postgres 25 | spec: 26 | nodeSelector: 27 | inferx_storage: data 28 | containers: 29 | - name: postgres 30 | image: postgres:14.5 31 | imagePullPolicy: IfNotPresent 32 | env: 33 | - name: POSTGRES_USER 34 | value: keycloak 35 | - name: POSTGRES_PASSWORD 36 | value: "123456" 37 | - name: POSTGRES_DB 38 | value: keycloak 39 | - name: PGDATA 40 | value: /data/postgres 41 | ports: 42 | - containerPort: 5432 43 | volumeMounts: 44 | - name: db-data 45 | mountPath: /data/postgres 46 | volumes: 47 | - name: db-data 48 | hostPath: 49 | path: /opt/inferx/data/postgres_keycloak 50 | type: DirectoryOrCreate 51 | --- 52 | apiVersion: v1 53 | kind: Service 54 | metadata: 55 | name: keycloak-postgres 56 | spec: 57 | selector: 58 | app: keycloak-postgres 59 | ports: 60 | - port: 5432 61 | targetPort: 5432 62 | -------------------------------------------------------------------------------- /k8s/nvidia-test.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nvidia-test 5 | spec: 6 | containers: 7 | - name: cuda-container 8 | image: nvidia/cuda:12.2.0-devel-ubuntu20.04 9 | imagePullPolicy: IfNotPresent 10 | command: ["sleep", "infinity"] 11 | resources: 12 | limits: 13 | nvidia.com/gpu: 1 14 | nodeSelector: 15 | kubernetes.io/hostname: brad-ms-7d46 -------------------------------------------------------------------------------- /k8s/scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: scheduler 5 | labels: 6 | app: scheduler 7 | spec: 8 | replicas: 2 9 | selector: 10 | matchLabels: 11 | app: scheduler 12 | template: 13 | metadata: 14 | labels: 15 | app: scheduler 16 | spec: 17 | hostPID: true 18 | containers: 19 | - name: scheduler 20 | image: inferx/inferx_one:v0.1.1 21 | imagePullPolicy: IfNotPresent 22 | env: 23 | - name: POD_IP 24 | valueFrom: 25 | fieldRef: 26 | fieldPath: status.podIP 27 | - name: RUN_SERVICE 28 | value: "Scheduler" 29 | - name: STATESVC_ADDR 30 | value: "http://statesvc:1237" 31 | volumeMounts: 32 | - mountPath: /opt/inferx/ 33 | name: opt-inferx 34 | command: ["./onenode", "/opt/inferx/config/node.json"] 35 | volumes: 36 | - name: opt-inferx 37 | hostPath: 38 | path: /opt/inferx/ 39 | --- 40 | apiVersion: v1 41 | kind: Service 42 | metadata: 43 | name: scheduler 44 | spec: 45 | type: NodePort 46 | selector: 47 | app: scheduler 48 | ports: 49 | - name: http 50 | port: 1238 51 | targetPort: 1238 52 | -------------------------------------------------------------------------------- /k8s/secretdb.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: secret-db-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: 1Gi 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: secret-db 16 | spec: 17 | replicas: 1 18 | selector: 19 | matchLabels: 20 | app: secret-db 21 | template: 22 | metadata: 23 | labels: 24 | app: secret-db 25 | spec: 26 | nodeSelector: 27 | inferx_storage: data 28 | containers: 29 | - name: postgres 30 | image: postgres:14.5 31 | imagePullPolicy: IfNotPresent 32 | ports: 33 | - containerPort: 5432 34 | env: 35 | - name: POSTGRES_USER 36 | value: secret 37 
| - name: POSTGRES_PASSWORD 38 | value: "123456" 39 | - name: POSTGRES_DB 40 | value: secretdb 41 | - name: PGDATA 42 | value: /data/postgres 43 | volumeMounts: 44 | - name: db-data 45 | mountPath: /data/postgres 46 | - name: init-sql 47 | mountPath: /docker-entrypoint-initdb.d/db.sql 48 | volumes: 49 | - name: db-data 50 | hostPath: 51 | path: /opt/inferx/data/postgres_secret 52 | type: DirectoryOrCreate 53 | - name: init-sql 54 | hostPath: 55 | path: /opt/inferx/config/secret.sql 56 | type: File 57 | --- 58 | apiVersion: v1 59 | kind: Service 60 | metadata: 61 | name: secret-db 62 | spec: 63 | selector: 64 | app: secret-db 65 | ports: 66 | - port: 5432 67 | targetPort: 5432 68 | nodePort: 30541 69 | type: NodePort 70 | 71 | -------------------------------------------------------------------------------- /k8s/spdk.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: spdk 5 | labels: 6 | app: spdk 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: spdk 11 | template: 12 | metadata: 13 | labels: 14 | app: spdk 15 | spec: 16 | nodeSelector: 17 | inferx_nodeType: inferx_blob 18 | hostNetwork: true 19 | hostPID: true 20 | containers: 21 | - name: spdk 22 | image: inferx/spdk-container2:v0.1.0 23 | imagePullPolicy: IfNotPresent 24 | securityContext: 25 | privileged: true 26 | runAsUser: 0 27 | env: 28 | - name: HUGEMEM 29 | value: "64000" 30 | volumeMounts: 31 | - name: hugepages 32 | mountPath: /dev/hugepages 33 | - name: lib-modules 34 | mountPath: /lib/modules 35 | - name: opt-inferx 36 | mountPath: /opt/inferx 37 | - name: run-udev 38 | mountPath: /run/udev 39 | volumes: 40 | - name: hugepages 41 | hostPath: 42 | path: /dev/hugepages 43 | - name: lib-modules 44 | hostPath: 45 | path: /lib/modules 46 | - name: opt-inferx 47 | hostPath: 48 | path: /opt/inferx 49 | - name: run-udev 50 | hostPath: 51 | path: /run/udev 52 | restartPolicy: Always 53 | tolerations: 54 | - operator: "Exists" # Allow on tainted nodes 55 | -------------------------------------------------------------------------------- /k8s/statesvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: statesvc 5 | labels: 6 | app: statesvc 7 | spec: 8 | replicas: 2 9 | selector: 10 | matchLabels: 11 | app: statesvc 12 | template: 13 | metadata: 14 | labels: 15 | app: statesvc 16 | spec: 17 | hostPID: true 18 | containers: 19 | - name: statesvc 20 | image: inferx/inferx_one:v0.1.1 21 | imagePullPolicy: IfNotPresent 22 | env: 23 | - name: POD_IP 24 | valueFrom: 25 | fieldRef: 26 | fieldPath: status.podIP 27 | - name: RUN_SERVICE 28 | value: "StateSvc" 29 | - name: CACHE_MEMORY 30 | value: 20Gi 31 | volumeMounts: 32 | - mountPath: /opt/inferx/ 33 | name: opt-inferx 34 | command: ["./onenode", "/opt/inferx/config/node.json"] 35 | volumes: 36 | - name: opt-inferx 37 | hostPath: 38 | path: /opt/inferx/ 39 | --- 40 | apiVersion: v1 41 | kind: Service 42 | metadata: 43 | name: statesvc 44 | spec: 45 | type: NodePort 46 | selector: 47 | app: statesvc 48 | ports: 49 | - name: http 50 | port: 1237 51 | targetPort: 1237 52 | -------------------------------------------------------------------------------- /nodeconfig/node.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node1", 3 | "etcdAddrs": [ 4 | "http://etcd:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 
8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | "cidr": "10.1.3.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket", 19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket", 20 | "runService": true, 21 | "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 400000, 25 | "GPUs": "Auto", 26 | "ContextOverhead": 450, 27 | "MaxContextPerGPU": 1 28 | }, 29 | "snapshotDir": "/opt/inferx/snapshot", 30 | "enableBlobStore": false, 31 | "sharemem": { 32 | "size": 20, 33 | "hugepage": true 34 | }, 35 | "tlsconfig": { 36 | "enable": false, 37 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem", 38 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem" 39 | }, 40 | "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb", 41 | "keycloakconfig": { 42 | "url": "http://keycloak:8080/authn", 43 | "realm": "inferx" 44 | } 45 | } -------------------------------------------------------------------------------- /nodeconfig/node1.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node1", 3 | "etcdAddrs": [ 4 | "http://localhost:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | "cidr": "10.1.3.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket", 19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket", 20 | "runService": true, 21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 400000, 25 | "GPUType": "A4000", 26 | "GPUs": "Auto", 27 | "ContextOverhead": 450, 28 | "MaxContextPerGPU": 1 29 | }, 30 | "snapshotDir": "/opt/inferx/snapshot", 31 | "enableBlobStore": false, 32 | "sharemem": { 33 | "size": 20, 34 | "hugepage": true 35 | }, 36 | "tlsconfig": { 37 | "enable": false, 38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem", 39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem" 40 | }, 41 | "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb", 42 | "keycloakconfig": { 43 | "url": "http://localhost:1260/authn", 44 | "realm": "inferx" 45 | } 46 | } -------------------------------------------------------------------------------- /nodeconfig/node2.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node2", 3 | "etcdAddrs": [ 4 | "http://localhost:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | "cidr": "10.1.2.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/var/run/quark/tsot-socket", 19 | "tsotGwSocketPath": "/var/run/quark_host/tsot-socket", 20 | "runService": false, 21 | "auditdbAddr": "postgresql://audit_user:123456@localhost/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 300000, 25 | "GPUType": "A4000", 26 | "GPUs": "Auto", 27 | "ContextOverhead": 450, 28 | "MaxContextPerGPU": 2 29 | }, 30 | 
"snapshotDir": "/snapshot", 31 | "enableBlobStore": true 32 | } -------------------------------------------------------------------------------- /nodeconfig/node3.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node2", 3 | "etcdAddrs": [ 4 | "http://localhost:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | "cidr": "10.1.2.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket", 19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket", 20 | "runService": true, 21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 400000, 25 | "GPUType": "A4000", 26 | "GPUs": "Auto", 27 | "ContextOverhead": 440, 28 | "MaxContextPerGPU": 1 29 | }, 30 | "snapshotDir": "/opt/inferx/snapshot", 31 | "enableBlobStore": true, 32 | "sharemem": { 33 | "size": 36, 34 | "hugepage": true 35 | }, 36 | "tlsconfig": { 37 | "enable": false, 38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem", 39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem" 40 | }, 41 | "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb", 42 | "keycloakconfig": { 43 | "url": "http://localhost:1260/authn", 44 | "realm": "inferx" 45 | } 46 | } -------------------------------------------------------------------------------- /nodeconfig/node4.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node3", 3 | "etcdAddrs": [ 4 | "http://localhost:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | "cidr": "10.1.2.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket", 19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket", 20 | "runService": true, 21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:30542/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 400000, 25 | "GPUType": "A4000", 26 | "GPUs": "Auto", 27 | "ContextOverhead": 450, 28 | "MaxContextPerGPU": 2 29 | }, 30 | "snapshotDir": "/opt/inferx/snapshot", 31 | "enableBlobStore": true, 32 | "sharemem": { 33 | "size": 36, 34 | "hugepage": true 35 | }, 36 | "tlsconfig": { 37 | "enable": false, 38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem", 39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem" 40 | }, 41 | "secretStoreAddr": "postgresql://secret:123456@localhost:30541/secretdb", 42 | "keycloakconfig": { 43 | "url": "http://localhost:31260", 44 | "realm": "inferx", 45 | "adminUser": "admin", 46 | "adminPassword": "admin" 47 | } 48 | } -------------------------------------------------------------------------------- /nodeconfig/node_blob.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeName": "node1", 3 | "etcdAddrs": [ 4 | "http://etcd:2379" 5 | ], 6 | "hostIpCidr": "192.168.0.0/16", 7 | "podMgrPort": 1233, 8 | "tsotCniPort": 1234, 9 | "tsotSvcPort": 1235, 10 | "qletStateSvcPort": 1236, 11 | "statSvcPort": 1237, 12 | "schedulerPort": 1238, 13 | "gatewayPort": 4000, 14 | 
"cidr": "10.1.3.0/8", 15 | "stateSvcAddrs": [ 16 | "http://localhost:1237" 17 | ], 18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket", 19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket", 20 | "runService": true, 21 | "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb", 22 | "resources": { 23 | "CPU": 30000, 24 | "Mem": 400000, 25 | "GPUType": "A4000", 26 | "GPUs": "Auto", 27 | "ContextOverhead": 450, 28 | "MaxContextPerGPU": 1 29 | }, 30 | "snapshotDir": "/opt/inferx/snapshot", 31 | "enableBlobStore": true, 32 | "sharemem": { 33 | "size": 50, 34 | "hugepage": true 35 | }, 36 | "tlsconfig": { 37 | "enable": false, 38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem", 39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem" 40 | }, 41 | "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb", 42 | "keycloakconfig": { 43 | "url": "http://keycloak:8080/authn", 44 | "realm": "inferx" 45 | } 46 | } -------------------------------------------------------------------------------- /script/inferx_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PARENT_DIR="/opt/inferx/sandbox/" 4 | INFERX_BIN="/opt/inferx/bin/inferx" 5 | 6 | # pkill -9 inferx 7 | 8 | for SUBDIR in "$PARENT_DIR"/*; do 9 | if [ -d "$SUBDIR" ]; then 10 | SUBFOLDER_NAME=$(basename "$SUBDIR") 11 | echo "Running inferx on: $SUBFOLDER_NAME" 12 | "$INFERX_BIN" \ 13 | --root "/var/run/docker/runtime-runc/moby" \ 14 | --log-format json \ 15 | --systemd-cgroup delete "$SUBFOLDER_NAME" 16 | 17 | fi 18 | done --------------------------------------------------------------------------------