├── .gitignore
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── config
│   ├── Aquila-7B.json
│   ├── BAAI_namespace.json
│   ├── Baichuan-7B.json
│   ├── Baichuan2-13B-Chat-4bits.json
│   ├── Baichuan2-7B-Chat.json
│   ├── DeciLM-7B-instruct.json
│   ├── DeciLM-7B.json
│   ├── Deci_namespace.json
│   ├── DeepSeek-R1-Distill-Llama-8B.json
│   ├── DeepSeek-R1-Distill-Qwen-1.5B.json
│   ├── DeepSeek-R1-Distill-Qwen-7B.json
│   ├── EXAONE-3.0-7.8B-Instruct copy.json
│   ├── EXAONE-3.0-7.8B-Instruct.json
│   ├── EleutherAI_namespace.json
│   ├── Llama-2-13b-hf.json
│   ├── Llama-3.2-3B-Instruct.json
│   ├── Llama-3.2-3B-Instruct_2gpu.json
│   ├── Meta-Llama-3-8B-Instruct.json
│   ├── Meta-Llama-3-8B.json
│   ├── MiniCPM-2B-dpo-bf16.json
│   ├── MiniCPM-2B-sft-bf16.json
│   ├── MiniCPM3-4B.json
│   ├── Minitron-8B-Base.json
│   ├── Mistral-7B-Instruct-v0.1.json
│   ├── Mistral-7B-v0.1.json
│   ├── Mistral-7B-v0.1_2gpu.json
│   ├── OLMo-1B-hf.json
│   ├── OLMo-1B-hf_2gpu.json
│   ├── OLMo-7B-hf.json
│   ├── OLMoE-1B-7B-0924-Instruct.json
│   ├── OLMoE-1B-7B-0924.json
│   ├── OpenAssistant_namespace.json
│   ├── Phi-3-mini-128k-instruct.json
│   ├── Phi-3-mini-4k-instruct.json
│   ├── Qwen-VL-Chat.json
│   ├── Qwen.json
│   ├── Qwen1.5-MoE-A2.7B.json
│   ├── Qwen2.5-1.5B.json
│   ├── Qwen2.5-7B-Instruct-1M.json
│   ├── Qwen2.5-7B-Instruct-GPTQ-Int8.json
│   ├── Qwen2.5-7B.json
│   ├── Qwen2.5-Coder-1.5B-Instruct.json
│   ├── Qwen2.5-Coder-14B-Instruct-GPTQ-Int8.json
│   ├── Qwen2.5-Coder-3B.json
│   ├── Qwen2.5-Coder-7B-Instruct.json
│   ├── Qwen2.5-Math-1.5B-Instruct.json
│   ├── Qwen2.5-Math-1.5B.json
│   ├── Qwen2.5-Math-7B-Instruct.json
│   ├── Qwen2.5-Math-7B.json
│   ├── Qwen7BInt8.json
│   ├── Qwen_namespace.json
│   ├── Salesforce_namespace.json
│   ├── THUDM_namespace.json
│   ├── TinyLlama-1.1B-Chat-v1.0.json
│   ├── TinyLlama-1.1B-Chat-v1.0_13GB.json
│   ├── TinyLlama-1.1B-Chat-v1.0_2gpu.json
│   ├── TinyLlama-1.1B-Chat-v1.0_test.json
│   ├── TinyLlama_namespace.json
│   ├── XVERSE-13B-Chat.json
│   ├── XVERSE-7B-Chat.json
│   ├── allenai_namespace.json
│   ├── baichuan-inc_namespace.json
│   ├── bigcode_namespace.json
│   ├── chatglm3-6b-128k.json
│   ├── chatglm3-6b-32k.json
│   ├── chatglm3-6b.json
│   ├── codegen-2B-multi.json
│   ├── core42_jais-13b-bnb-4bit.json
│   ├── core42_jais-13b-chat-bnb-4bit.json
│   ├── databricks_namespace.json
│   ├── deepseek-ai_namespace.json
│   ├── deepseek-llm-7b-chat.json
│   ├── deepseek-llm-7b-chat_2gpu.json
│   ├── deepseek-math-7b-instruct.json
│   ├── deepseek-math-7b-rl.json
│   ├── deepseek-vl2-tiny.json
│   ├── dolly-v2-12b.json
│   ├── facebook_namespace.json
│   ├── falcon-7b.json
│   ├── falcon-rw-7b.json
│   ├── gemma-7b.json
│   ├── gpt-j-6b.json
│   ├── gpt2-xl.json
│   ├── gpt4all-j.json
│   ├── internlm2-7b.json
│   ├── internlm2_5-7b-chat.json
│   ├── llama_8BInt8.json
│   ├── llava-1.5-7b-hf.json
│   ├── llava-hf_namespace.json
│   ├── mamba-1.4b-hf.json
│   ├── mamba-2.8b-hf.json
│   ├── meta-llama_namespace.json
│   ├── microsoft_namespace.json
│   ├── mistral.json
│   ├── mistralai_namespace.json
│   ├── models.txt
│   ├── mosaicml_namespace.json
│   ├── mpt-7b-storywriter.json
│   ├── mpt-7b.json
│   ├── namespace1.json
│   ├── nomic-ai_namespace.json
│   ├── ns1_namespace.json
│   ├── oasst-sft-4-pythia-12b-epoch-3.5.json
│   ├── openai-community_namespace.json
│   ├── openbmb_namespace.json
│   ├── opt-iml-max-1.3b.json
│   ├── persimmon-8b-base.json
│   ├── persimmon-8b-chat.json
│   ├── public.json
│   ├── pythia-12b.json
│   ├── reader.json
│   ├── stabilityai_namespace.json
│   ├── stable-diffusion-xl-base-1.0.json
│   ├── stablelm-3b-4e1t.json
│   ├── stablelm-tuned-alpha-7b.json
│   ├── starcoder2-3b.json
│   ├── starcoder2-7b.json
│   ├── state-spaces_namespace.json
│   ├── tenant1.json
│   └── tiiuae_namespace.json
├── dashboard
│   ├── Makefile
│   ├── __pycache__
│   │   ├── na_pb2.cpython-38.pyc
│   │   ├── na_pb2_grpc.cpython-38.pyc
│   │   ├── qobjs_pb2.cpython-38.pyc
│   │   └── qobjs_pb2_grpc.cpython-38.pyc
│   ├── app.py
│   ├── client.py
│   ├── doc
│   ├── gunicorn.conf.py
│   ├── na_pb2.py
│   ├── na_pb2_grpc.py
│   ├── nginx.conf
│   ├── qobjs_pb2.py
│   ├── qobjs_pb2_grpc.py
│   ├── requirements.txt
│   ├── sql
│   │   ├── audit.sql
│   │   ├── create_table.sql
│   │   ├── kv.sql
│   │   └── secret.sql
│   ├── static
│   │   └── button.gif
│   └── templates
│       ├── admin.html
│       ├── base.html
│       ├── func.html
│       ├── func_list.html
│       ├── index.html
│       ├── log.html
│       ├── markdown.html
│       ├── node.html
│       ├── node_list.html
│       ├── pod.html
│       ├── pod_list.html
│       └── snapshot_list.html
├── deployment
│   ├── dashboard.Dockerfile
│   ├── llava.Dockerfile
│   ├── one.Dockerfile
│   ├── spdk.Dockerfile
│   ├── spdk.script
│   ├── spdk2.Dockerfile
│   └── vllm-opai.Dockerfile
├── doc
│   ├── GPUSnapshot.png
│   ├── architect.png
│   ├── comparison.png
│   ├── daemon.json
│   ├── home.md
│   ├── infer_Profile.png
│   ├── keycloak.md
│   ├── logo.png
│   ├── logo1.png
│   └── logo2.png
├── docker-compose.yml
├── docker-compose_blob.yml
├── inferx-realm.json
├── inferxlib
│   ├── Cargo.toml
│   └── src
│       ├── common.rs
│       ├── data_obj.rs
│       ├── lib.rs
│       ├── node.rs
│       ├── obj_mgr
│       │   ├── cidrlock.rs
│       │   ├── func_mgr.rs
│       │   ├── funcsnapshot_mgr.rs
│       │   ├── mod.rs
│       │   ├── namespace_mgr.rs
│       │   ├── node_mgr.rs
│       │   ├── pod_mgr.rs
│       │   └── tenant_mgr.rs
│       ├── resource.rs
│       ├── selector.rs
│       └── validation.rs
├── ixctl
│   ├── command.rs
│   ├── create.rs
│   ├── delete.rs
│   ├── get.rs
│   ├── list.rs
│   ├── main.rs
│   ├── object_client.rs
│   └── update.rs
├── ixctl_logging_config.yaml
├── k8s
│   ├── clean-k3sagent.sh
│   ├── cleanup-k3s.sh
│   ├── dashboard.yaml
│   ├── db-deployment.yaml
│   ├── etcd.yaml
│   ├── inferx_one.yaml
│   ├── inferx_one_blob.yaml
│   ├── ingress.yaml
│   ├── install-k3s.sh
│   ├── join-k3sagent.sh
│   ├── keycloak.yaml
│   ├── keycloak_postgres.yaml
│   ├── nodeagent.yaml
│   ├── nvidia-test.yaml
│   ├── scheduler.yaml
│   ├── secretdb.yaml
│   ├── spdk.yaml
│   └── statesvc.yaml
├── nodeconfig
│   ├── node.json
│   ├── node1.json
│   ├── node2.json
│   ├── node3.json
│   ├── node4.json
│   └── node_blob.json
└── script
    ├── inferx_clean.sh
    ├── run_llava.py
    ├── run_model.py
    └── run_stablediffusion.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | debug/
4 | target/
5 |
6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
8 | Cargo.lock
9 |
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk
12 |
13 | # MSVC Windows builds of rustc generate these, which store debugging information
14 | *.pdb
15 |
16 | # RustRover
17 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
18 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
19 | # and can be added to the global gitignore or merged into this file. For a more nuclear
20 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
21 | #.idea/
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "qservice"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 |
8 | [dependencies]
9 | inferxlib = { path = "./inferxlib" }
10 |
11 | libc = "0.2.94"
12 | tokio = { version = "1.25", features = ["full"] }
13 | tokio-stream = { version = "0.1", features = ["net"] }
14 | tonic = { version = "0.8" }
15 | hostname = "^0.3"
16 | rand = "0.8.5"
17 | serde = { version = "1.0", features = ["derive"] }
18 | serde_json = "1.0"
19 | serde_derive = "1.0"
20 | regex = "1.7.1"
21 | reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
22 | chrono = "0.4.24"
23 | tower = "0.4.13"
24 | k8s-openapi = { version = "0.18.0", features = ["v1_26"] }
25 | simple-logging = "2.0.2"
26 | log = "0.4.17"
27 | log4rs = "1"
28 | const_format = "0.2.30"
29 | local-ip-address = "0.5.1"
30 | once_cell = "1.17.1"
31 | ipnetwork = "0.20.0"
32 | scopeguard = { version = "^1.1.0", default-features = false }
33 | errno = "0.2.4"
34 | nix = "0.23.1"
35 | futures = "0.3"
36 | dns-lookup = "2.0.4"
37 | #clap = "4.5.9"
38 | clap = "2.33.3"
39 | oauth2 = "4.0"
40 |
41 | axum = "0.7.4"
42 | hyper = { version = "1.3.1", features = ["full"] }
43 | hyper-util = { version = "0.1.3", features = ["full"] }
44 | http-body-util = "0.1"
45 | backtrace = "0.3.74"
46 |
47 | [dependencies.lazy_static]
48 | version = "1.0"
49 | features = ["spin_no_std"]
50 |
51 | [dependencies.uuid]
52 | version = "1.3.1"
53 | features = [
54 | "v4", # Lets you generate random UUIDs
55 | "fast-rng", # Use a faster (but still sufficiently random) RNG
56 | "macro-diagnostics", # Enable better diagnostics for compile-time UUIDs
57 | ]
58 |
59 | [[bin]]
60 | name = "ixctl"
61 | path = "ixctl/main.rs"
62 |
--------------------------------------------------------------------------------
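
Editor's note: the config/ directory that follows holds one JSON object per model function or namespace, and the serde/serde_json dependencies declared above are presumably what loads them. Below is a minimal, illustrative sketch of deserializing one of these files; the struct and field names are inferred from the config files themselves, not taken from inferxlib's real types.

use serde::Deserialize;

// Illustrative only: mirrors the top-level shape of the config/*.json
// files below; the authoritative types live under inferxlib/src.
#[derive(Debug, Deserialize)]
struct ConfigObject {
    #[serde(rename = "type")]
    kind: String,              // "function" or "namespace"
    tenant: String,            // e.g. "public" or "t1"
    namespace: String,         // e.g. "BAAI"
    name: String,              // e.g. "Aquila-7B"
    #[serde(default)]
    object: serde_json::Value, // spec/status payload; some older configs
                               // put "spec" at the top level instead, so
                               // this field defaults to null when absent
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string("config/Aquila-7B.json")?;
    let cfg: ConfigObject = serde_json::from_str(&raw)?;
    println!("{} {}/{}/{}", cfg.kind, cfg.tenant, cfg.namespace, cfg.name);
    Ok(())
}
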
/config/Aquila-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "BAAI",
5 | "name": "Aquila-7B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "BAAI/Aquila-7B",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 60000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Here is a recipe for vegan banana bread:",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "BAAI/Aquila-7B",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
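
Editor's note: each function config embeds a sample_query describing one canned request against the container's OpenAI-compatible completions endpoint (port 8000, path v1/completions, plus the prompt and body fields above). A minimal sketch of replaying it with the reqwest blocking client already declared in Cargo.toml; the "localhost:8000" host below is an assumption (the endpoint block only fixes the container port), and streaming is disabled so the response is a single JSON document.

use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Body assembled from the sample_query above; the config keeps the
    // prompt in a separate field, so it is merged into the body here.
    let body = json!({
        "model": "BAAI/Aquila-7B",
        "prompt": "Here is a recipe for vegan banana bread:",
        "max_tokens": 1000,
        "temperature": 0,
        "stream": false // sample_query streams; off here for a one-shot reply
    });
    let resp = reqwest::blocking::Client::new()
        .post("http://localhost:8000/v1/completions") // assumed host:port
        .json(&body)
        .send()?;
    println!("{}", resp.text()?);
    Ok(())
}
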
/config/BAAI_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "BAAI",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/Baichuan-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "baichuan-inc",
5 | "name": "Baichuan-7B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "baichuan-inc/Baichuan-7B",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "1200",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 60000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Give me a short introduction to large language model.",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "baichuan-inc/Baichuan-7B",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/Baichuan2-13B-Chat-4bits.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "baichuan-inc",
5 | "name": "Baichuan2-13B-Chat-4bits",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "baichuan-inc/Baichuan2-13B-Chat-4bits",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--trust-remote-code"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 24000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "解释一下'温故而知新'",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "baichuan-inc/Baichuan2-13B-Chat-4bits",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/DeciLM-7B-instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Deci",
5 | "name": "DeciLM-7B-instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Deci/DeciLM-7B-instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "port": 8000,
41 | "schema": "Http",
42 | "probe": "/health"
43 | },
44 | "sample_query": {
45 | "apiType": "openai",
46 | "prompt": "Here is a recipe for vegan banana bread:",
47 | "path": "v1/completions",
48 | "body": {
49 | "model": "Deci/DeciLM-7B-instruct",
50 | "max_tokens": "1000",
51 | "temperature": "0",
52 | "stream": "true"
53 | }
54 | },
55 | "standby": {
56 | "gpu": "Blob",
57 | "pageable": "Blob",
58 | "pinned": "Blob"
59 | }
60 | }
61 | }
62 | }
--------------------------------------------------------------------------------
/config/DeciLM-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Deci",
5 | "name": "DeciLM-7B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Deci/DeciLM-7B",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "1200",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "port": 8000,
41 | "schema": "Http",
42 | "probe": "/health"
43 | },
44 | "sample_query": {
45 | "apiType": "openai",
46 | "prompt": "Here is a recipe for vegan banana bread:",
47 | "path": "v1/completions",
48 | "body": {
49 | "model": "Deci/DeciLM-7B",
50 | "max_tokens": "1000",
51 | "temperature": "0",
52 | "stream": "true"
53 | }
54 | },
55 | "standby": {
56 | "gpu": "Blob",
57 | "pageable": "Blob",
58 | "pinned": "Blob"
59 | }
60 | }
61 | }
62 | }
--------------------------------------------------------------------------------
/config/Deci_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "Deci",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct copy.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "gemma-7b",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "google/gemma-7b",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "google/gemma-7b",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "EXAONE-3.0-7.8B-Instruct",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/EleutherAI_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "EleutherAI",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "meta-llama",
5 | "name": "Llama-3.2-3B-Instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "meta-llama/Llama-3.2-3B-Instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "200"
16 | ],
17 | "resources": {
18 | "CPU": 20000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 14600
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "def print_hello_world():",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "meta-llama/Llama-3.2-3B-Instruct",
53 | "max_tokens": "120",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct_2gpu.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "meta-llama",
5 | "name": "Llama-3.2-3B-Instruct_2gpu",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "meta-llama/Llama-3.2-3B-Instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "1000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 14600
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "port": 8000,
41 | "schema": "Http",
42 | "probe": "/health"
43 | },
44 | "sample_query": {
45 | "apiType": "openai",
46 | "prompt": "def print_hello_world():",
47 | "path": "v1/completions",
48 | "body": {
49 | "model": "meta-llama/Llama-3.2-3B-Instruct",
50 | "max_tokens": "120",
51 | "temperature": "0",
52 | "stream": "true"
53 | }
54 | },
55 | "standby": {
56 | "gpu": "Blob",
57 | "pageable": "Blob",
58 | "pinned": "Blob"
59 | }
60 | }
61 | }
62 | }
--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Meta-Llama-3-8B-Instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "meta-llama/Meta-Llama-3-8B-Instruct",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "meta-llama/Meta-Llama-3-8B-Instruct",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Meta-Llama-3-8B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "meta-llama/Meta-Llama-3-8B",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "meta-llama/Meta-Llama-3-8B",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/MiniCPM-2B-dpo-bf16.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "openbmb",
5 | "name": "MiniCPM-2B-dpo-bf16",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "openbmb/MiniCPM-2B-dpo-bf16",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 28000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "Give me a short introduction to large language model.",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "openbmb/MiniCPM-2B-dpo-bf16",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/MiniCPM-2B-sft-bf16.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "openbmb",
5 | "name": "MiniCPM-2B-sft-bf16",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "openbmb/MiniCPM-2B-sft-bf16",
12 | "--trust-remote-code",
13 | "--max-model-len",
14 | "1200"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 9000
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Give me a short introduction to large language model.",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "openbmb/MiniCPM-2B-sft-bf16",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/MiniCPM3-4B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "openbmb",
5 | "name": "MiniCPM3-4B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v.0.1",
9 | "commands": [
10 | "--model",
11 | "openbmb/MiniCPM3-4B",
12 | "--enforce-eager",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "200"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 24000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 9000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "推荐5个北京的景点。",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "openbmb/MiniCPM3-4B",
49 | "max_tokens": "100",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/config/Minitron-8B-Base.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Minitron-8B-Base",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "nvidia/Minitron-8B-Base",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/Quark/target/debug/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "nvidia/Minitron-8B-Base",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/Mistral-7B-Instruct-v0.1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Mistral-7B-Instruct-v0.1",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "mistralai/Mistral-7B-Instruct-v0.1",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "mistralai/Mistral-7B-Instruct-v0.1",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/Mistral-7B-v0.1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "mistralai",
5 | "name": "Mistral-7B-v0.1",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "mistralai/Mistral-7B-v0.1",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--gpu-memory-utilization",
15 | "0.99",
16 | "--max-model-len",
17 | "200"
18 | ],
19 | "resources": {
20 | "CPU": 20000,
21 | "Mem": 30000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 1,
25 | "vRam": 14800
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "I like traveling by train because",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "mistralai/Mistral-7B-v0.1",
51 | "max_tokens": "180",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Blob",
58 | "pageable": "Blob",
59 | "pinned": "Blob"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/OLMo-1B-hf.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "allenai",
5 | "name": "OLMo-1B-hf",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "allenai/OLMo-1B-hf",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 50000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 14600
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "What is the capital of USA?",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "allenai/OLMo-1B-hf",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/OLMo-1B-hf_2gpu.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "allenai",
5 | "name": "OLMo-1B-hf_2gpu",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "allenai/OLMo-1B-hf",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 14600
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "What is the capital of USA?",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "allenai/OLMo-1B-hf",
49 | "max_tokens": "1000",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/config/OLMo-7B-hf.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "allenai",
5 | "name": "OLMo-7B-hf",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "allenai/OLMo-7B-hf",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 20000,
19 | "Mem": 70000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "What is the capital of USA?",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "allenai/OLMo-7B-hf",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "OLMoE-1B-7B-0924-Instruct",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "allenai/OLMoE-1B-7B-0924-Instruct",
11 | "--disable-custom-all-reduce",
12 | "--trust-remote-code",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 6000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/Quark/target/debug/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "path": "/v1/completions",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "probe": {
44 | "path": "/health",
45 | "port": 8000,
46 | "schema": "Http"
47 | },
48 | "api_type": {
49 | "openai": {
50 | "name": "allenai/OLMoE-1B-7B-0924-Instruct",
51 | "max_tokens": 1000,
52 | "temperature": 0
53 | }
54 | },
55 | "keepalive": "Blob"
56 | }
57 | }
--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "OLMoE-1B-7B-0924",
6 | "spec": {
7 | "image": "vllm-openai-upgraded:v.0.1",
8 | "commands": [
9 | "--model",
10 | "allenai/OLMoE-1B-7B-0924",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/Quark/target/debug/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "allenai/OLMoE-1B-7B-0924",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/OpenAssistant_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "OpenAssistant",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/Phi-3-mini-128k-instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "microsoft",
5 | "name": "Phi-3-mini-128k-instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "microsoft/Phi-3-mini-128k-instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 24000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "How to explain Internet for a medieval knight?",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "microsoft/Phi-3-mini-128k-instruct",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/Phi-3-mini-4k-instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "microsoft",
5 | "name": "Phi-3-mini-4k-instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "microsoft/Phi-3-mini-4k-instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 24000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "microsoft/Phi-3-mini-4k-instruct",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/Qwen-VL-Chat.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Qwen-VL-Chat",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen-VL-Chat",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 70000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "Qwen/Qwen-VL-Chat",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/Qwen.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Qwen",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "Qwen/Qwen2.5-3B-Instruct",
11 | "--enforce-eager"
12 | ],
13 | "resources": {
14 | "CPU": 100,
15 | "Mem": 200,
16 | "GPU": {
17 | "Type": "Any",
18 | "Usage": {
19 | "Partial": 100
20 | }
21 | }
22 | },
23 | "envs": [
24 | [
25 | "LD_LIBRARY_PATH",
26 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
27 | ]
28 | ],
29 | "mounts": [
30 | {
31 | "hostpath": "/home/brad/cache",
32 | "mountpath": "/root/.cache/huggingface"
33 | }
34 | ],
35 | "endpoint": {
36 | "path": "/v1/completions",
37 | "port": 8000,
38 | "schema": "Http"
39 | },
40 | "probe": {
41 | "path": "/health",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "api_type": {
46 | "openai": {
47 | "name": "Qwen/Qwen2.5-3B-Instruct",
48 | "max_tokens": 200,
49 | "temperature": 0
50 | }
51 | }
52 | }
53 | }
--------------------------------------------------------------------------------
/config/Qwen1.5-MoE-A2.7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Qwen1.5-MoE-A2.7B",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "Qwen/Qwen1.5-MoE-A2.7B",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "Qwen/Qwen1.5-MoE-A2.7B",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-1.5B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-1.5B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-1.5B",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 8000
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "Qwen/Qwen2.5-1.5B",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-7B-Instruct-GPTQ-Int8.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-7B-Instruct-GPTQ-Int8",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
12 | "--gpu-memory-utilization",
13 | "0.99",
14 | "--max-model-len",
15 | "500"
16 | ],
17 | "resources": {
18 | "CPU": 20000,
19 | "Mem": 30000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 14200
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "Give me a short introduction to large language model.",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
53 | "max_tokens": "300",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "models--Qwen--Qwen2.5-7B",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "Qwen/Qwen2.5-7B",
11 | "--enforce-eager",
12 | "--max-model-len",
13 | "2000",
14 | "--tensor-parallel-size=2"
15 | ],
16 | "resources": {
17 | "CPU": 6000,
18 | "Mem": 80000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 2,
22 | "vRam": 14000
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ]
30 | ],
31 | "mounts": [
32 | {
33 | "hostpath": "/home/brad/cache",
34 | "mountpath": "/root/.cache/huggingface"
35 | }
36 | ],
37 | "endpoint": {
38 | "path": "/v1/completions",
39 | "port": 8000,
40 | "schema": "Http"
41 | },
42 | "probe": {
43 | "path": "/health",
44 | "port": 8000,
45 | "schema": "Http"
46 | },
47 | "api_type": {
48 | "openai": {
49 | "name": "Qwen/Qwen2.5-7B",
50 | "max_tokens": 1000,
51 | "temperature": 0
52 | }
53 | }
54 | }
55 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-1.5B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Coder-1.5B-Instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Coder-1.5B-Instruct",
12 | "--max-model-len",
13 | "1000"
14 | ],
15 | "resources": {
16 | "CPU": 12000,
17 | "Mem": 24000,
18 | "GPU": {
19 | "Type": "Any",
20 | "Count": 1,
21 | "vRam": 6000
22 | }
23 | },
24 | "envs": [
25 | [
26 | "LD_LIBRARY_PATH",
27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 | ],
29 | [
30 | "VLLM_CUDART_SO_PATH",
31 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "write a quick sort algorithm.",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
51 | "max_tokens": "800",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Blob",
58 | "pageable": "Blob",
59 | "pinned": "Blob"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-3B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Coder-3B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Coder-3B",
12 | "--max-model-len",
13 | "1000"
14 | ],
15 | "resources": {
16 | "CPU": 12000,
17 | "Mem": 24000,
18 | "GPU": {
19 | "Type": "Any",
20 | "Count": 1,
21 | "vRam": 10000
22 | }
23 | },
24 | "envs": [
25 | [
26 | "LD_LIBRARY_PATH",
27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 | ],
29 | [
30 | "VLLM_CUDART_SO_PATH",
31 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "write a quick sort algorithm.",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "Qwen/Qwen2.5-Coder-3B",
51 | "max_tokens": "800",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Blob",
58 | "pageable": "Blob",
59 | "pinned": "Blob"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-7B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Coder-7B-Instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Coder-7B-Instruct",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "write a quick sort algorithm.",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B-Instruct.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Math-1.5B-Instruct",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Math-1.5B-Instruct"
12 | ],
13 | "resources": {
14 | "CPU": 12000,
15 | "Mem": 24000,
16 | "GPU": {
17 | "Type": "Any",
18 | "Count": 1,
19 | "vRam": 7000
20 | }
21 | },
22 | "envs": [
23 | [
24 | "LD_LIBRARY_PATH",
25 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
26 | ],
27 | [
28 | "VLLM_CUDART_SO_PATH",
29 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "Qwen/Qwen2.5-Math-1.5B-Instruct",
49 | "max_tokens": "200",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Math-1.5B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Math-1.5B",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 8000
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "Qwen/Qwen2.5-Math-1.5B",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/Qwen2.5-Math-7B.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Qwen",
5 | "name": "Qwen2.5-Math-7B",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "Qwen/Qwen2.5-Math-7B",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "Qwen/Qwen2.5-Math-7B",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/Qwen7BInt8.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "Qwen",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
11 | "--enforce-eager",
12 | "--gpu-memory-utilization", "0.99",
13 | "--max-model-len", "1000"
14 | ],
15 | "resources": {
16 | "CPU": 100,
17 | "Mem": 200,
18 | "GPU": {
19 | "Type": "RTX3060",
20 | "Count": 1
21 | }
22 | },
23 | "envs": [
24 | [
25 | "LD_LIBRARY_PATH",
26 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
27 | ]
28 | ],
29 | "mounts": [
30 | {
31 | "hostpath": "/home/brad/cache",
32 | "mountpath": "/root/.cache/huggingface"
33 | }
34 | ],
35 | "endpoint": {
36 | "path": "/v1/completions",
37 | "port": 8000,
38 | "schema": "Http"
39 | },
40 | "probe": {
41 | "path": "/health",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "api_type": {
46 | "openai": {
47 | "name": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
48 | "max_tokens": 200,
49 | "temperature": 0
50 | }
51 | }
52 | }
53 | }
--------------------------------------------------------------------------------
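Note: every other function spec in this directory passes each flag and its value as separate "commands" elements, which matters if the tokens are handed to the container verbatim as argv entries (Kubernetes-style args; an assumption, but consistent with the rest of this tree). A fused token such as "--max-model-len 1000" would reach vLLM's argparse-based CLI as a single argument and be rejected. A minimal sketch of the failure mode:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--max-model-len", type=int)

print(p.parse_args(["--max-model-len", "1000"]))  # Namespace(max_model_len=1000)
try:
    p.parse_args(["--max-model-len 1000"])        # one fused token, as argv sees it
except SystemExit:
    print("rejected: argparse does not split the token on whitespace")

--------------------------------------------------------------------------------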
/config/Qwen_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "Qwen",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/Salesforce_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "Salesforce",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/THUDM_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "THUDM",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "TinyLlama",
5 | "name": "TinyLlama-1.1B-Chat-v1.0",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 20000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 4800
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Seattle is a",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "File",
59 | "pageable": "File",
60 | "pinned": "File"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0_13GB.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "TinyLlama",
5 | "name": "TinyLlama-1.1B-Chat-v1.0_13GB",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 20000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 13800
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Seattle is a",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0_2gpu.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "TinyLlama",
5 | "name": "TinyLlama-1.1B-Chat-v1.0_2gpu",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 20000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "Seattle is a",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
49 | "max_tokens": "1000",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
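Note: in every multi-GPU entry in this directory ("--tensor-parallel-size=2" here and in the 7B Coder/Math files above), the parallelism degree matches the GPU "Count". A small lint for that invariant; it assumes the files live under config/ as in this tree and tolerates both layouts that occur here (spec nested under "object", and spec at the top level):

import json
import pathlib

for path in sorted(pathlib.Path("config").glob("*.json")):
    cfg = json.loads(path.read_text())
    if cfg.get("type") != "function":
        continue
    spec = (cfg.get("object") or {}).get("spec") or cfg.get("spec", {})
    cmds = spec.get("commands", [])
    tp = 1  # vLLM's default tensor parallelism
    for i, tok in enumerate(cmds):
        if tok.startswith("--tensor-parallel-size"):
            tp = int(tok.split("=", 1)[1]) if "=" in tok else int(cmds[i + 1])
    count = spec.get("resources", {}).get("GPU", {}).get("Count", 1)
    if tp != count:
        print(f"{path.name}: tensor-parallel-size={tp} but GPU Count={count}")

--------------------------------------------------------------------------------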
/config/TinyLlama-1.1B-Chat-v1.0_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "TinyLlama-1.1B-Chat-v1.0_test",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.4.2",
9 | "commands": [
10 | "--model",
11 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 20000,
18 | "Mem": 18000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 4500
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ]
30 | ],
31 | "mounts": [
32 | {
33 | "hostpath": "/home/brad/cache",
34 | "mountpath": "/root/.cache/huggingface"
35 | }
36 | ],
37 | "endpoint": {
38 | "port": 8000,
39 | "schema": "Http",
40 | "probe": "/health"
41 | },
42 | "sample_query": {
43 | "apiType": "openai",
44 | "prompt": "Seattle is a",
45 | "path": "v1/completions",
46 | "body": {
47 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
48 | "max_tokens": "1000",
49 | "temperature": "0",
50 | "stream": "true"
51 | }
52 | },
53 | "standby": {
54 | "gpu": "File",
55 | "pageable": "File",
56 | "pinned": "File"
57 | }
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/TinyLlama_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "TinyLlama",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/XVERSE-13B-Chat.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "XVERSE-13B-Chat",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "xverse/XVERSE-13B-Chat",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "xverse/XVERSE-13B-Chat",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/XVERSE-7B-Chat.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "XVERSE-7B-Chat",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "xverse/XVERSE-7B-Chat",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "xverse/XVERSE-7B-Chat",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/allenai_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "allenai",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/baichuan-inc_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "baichuan-inc",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/bigcode_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "bigcode",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/chatglm3-6b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "THUDM",
5 | "name": "chatglm3-6b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "THUDM/chatglm3-6b",
12 | "--enforce-eager",
13 | "--max-model-len",
14 | "1500",
15 | "--gpu-memory-utilization",
16 | "0.99",
17 | "--trust-remote-code"
18 | ],
19 | "resources": {
20 | "CPU": 12000,
21 | "Mem": 24000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 1,
25 | "vRam": 13800
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ],
33 | [
34 | "VLLM_CUDART_SO_PATH",
35 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
36 | ]
37 | ],
38 | "mounts": [
39 | {
40 | "hostpath": "/home/brad/cache",
41 | "mountpath": "/root/.cache/huggingface"
42 | }
43 | ],
44 | "endpoint": {
45 | "port": 8000,
46 | "schema": "Http",
47 | "probe": "/health"
48 | },
49 | "sample_query": {
50 | "apiType": "openai",
51 | "prompt": "Give me a short introduction to large language model.",
52 | "path": "v1/completions",
53 | "body": {
54 | "model": "THUDM/chatglm3-6b",
55 | "max_tokens": "200",
56 | "temperature": "0",
57 | "stream": "true"
58 | }
59 | },
60 | "standby": {
61 | "gpu": "Blob",
62 | "pageable": "Blob",
63 | "pinned": "Blob"
64 | }
65 | }
66 | }
67 | }
--------------------------------------------------------------------------------
/config/codegen-2B-multi.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "Salesforce",
5 | "name": "codegen-2B-multi",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "entrypoint": [
10 | "/usr/bin/python3"
11 | ],
12 | "commands": [
13 | "/usr/lib/run_model.py",
14 | "Salesforce/codegen-2B-multi",
15 | "200"
16 | ],
17 | "resources": {
18 | "CPU": 20000,
19 | "Mem": 12000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "standard",
45 | "prompt": "def hello_world():",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "N/A",
49 | "max_tokens": "200",
50 | "temperature": "0"
51 | }
52 | },
53 | "standby": {
54 | "gpu": "Blob",
55 | "pageable": "Blob",
56 | "pinned": "Blob"
57 | }
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/core42_jais-13b-bnb-4bit.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "core42_jais-13b-bnb-4bit",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "jwnder/core42_jais-13b-bnb-4bit",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 20000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 1,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "jwnder/core42_jais-13b-bnb-4bit",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/config/core42_jais-13b-chat-bnb-4bit.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "core42_jais-13b-chat-bnb-4bit",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "jwnder/core42_jais-13b-chat-bnb-4bit",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 20000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 1,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "jwnder/core42_jais-13b-chat-bnb-4bit",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/config/databricks_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "databricks",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/deepseek-ai_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "deepseek-ai",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/deepseek-math-7b-rl.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "deepseek-ai",
5 | "name": "deepseek-math-7b-rl",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "commands": [
10 | "--model",
11 | "deepseek-ai/deepseek-math-7b-rl",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 20000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 13000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "deepseek-ai/deepseek-math-7b-rl",
51 | "max_tokens": "1000",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Blob",
58 | "pageable": "Blob",
59 | "pinned": "Blob"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/deepseek-vl2-tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "deepseek-ai",
5 | "name": "deepseek-vl2-tiny",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "commands": [
10 | "--model",
11 | "deepseek-ai/deepseek-vl2-tiny",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 20000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 13000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "What is the capital of USA?",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "deepseek-ai/deepseek-vl2-tiny",
51 | "max_tokens": "1000",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Blob",
58 | "pageable": "Blob",
59 | "pinned": "Blob"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/facebook_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "facebook",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/falcon-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "tiiuae",
5 | "name": "falcon-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "tiiuae/falcon-7b",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "tiiuae/falcon-7b",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/falcon-rw-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "tiiuae",
5 | "name": "falcon-rw-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "tiiuae/falcon-rw-7b",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 80000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ],
31 | [
32 | "VLLM_CUDART_SO_PATH",
33 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 | ]
35 | ],
36 | "mounts": [
37 | {
38 | "hostpath": "/home/brad/cache",
39 | "mountpath": "/root/.cache/huggingface"
40 | }
41 | ],
42 | "endpoint": {
43 | "port": 8000,
44 | "schema": "Http",
45 | "probe": "/health"
46 | },
47 | "sample_query": {
48 | "apiType": "openai",
49 | "prompt": "Here is a recipe for vegan banana bread:",
50 | "path": "v1/completions",
51 | "body": {
52 | "model": "tiiuae/falcon-rw-7b",
53 | "max_tokens": "1000",
54 | "temperature": "0",
55 | "stream": "true"
56 | }
57 | },
58 | "standby": {
59 | "gpu": "Blob",
60 | "pageable": "Blob",
61 | "pinned": "Blob"
62 | }
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/config/gemma-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "gemma-7b",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "google/gemma-7b",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 6000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 15000
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ]
32 | ],
33 | "mounts": [
34 | {
35 | "hostpath": "/home/brad/cache",
36 | "mountpath": "/root/.cache/huggingface"
37 | }
38 | ],
39 | "endpoint": {
40 | "path": "/v1/completions",
41 | "port": 8000,
42 | "schema": "Http"
43 | },
44 | "probe": {
45 | "path": "/health",
46 | "port": 8000,
47 | "schema": "Http"
48 | },
49 | "api_type": {
50 | "openai": {
51 | "name": "google/gemma-7b",
52 | "max_tokens": 1000,
53 | "temperature": 0
54 | }
55 | },
56 | "keepalive": "Blob"
57 | }
58 | }
--------------------------------------------------------------------------------
/config/gpt-j-6b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "gpt-j-6b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "EleutherAI/gpt-j-6b",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 70000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "EleutherAI/gpt-j-6b",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/gpt2-xl.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "openai-community",
5 | "name": "gpt2-xl",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "openai-community/gpt2-xl",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "800"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 24000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 12000
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "Here is a recipe for vegan banana bread:",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "openai-community/gpt2-xl",
52 | "max_tokens": "600",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/gpt4all-j.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "nomic-ai",
5 | "name": "gpt4all-j",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "nomic-ai/gpt4all-j",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 60000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Here is a recipe for vegan banana bread:",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "nomic-ai/gpt4all-j",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/internlm2-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "internlm2-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "internlm/internlm2-7b",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "internlm/internlm2-7b",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/internlm2_5-7b-chat.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "internlm2_5-7b-chat",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "internlm/internlm2_5-7b-chat",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "internlm/internlm2_5-7b-chat",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/llama_8BInt8.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "llama_8BInt8",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "meta-llama/Llama-Guard-3-8B-INT8",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000",
15 | "--tensor-parallel-size=2"
16 | ],
17 | "resources": {
18 | "CPU": 6000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/Quark/target/debug/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "path": "/v1/completions",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "probe": {
44 | "path": "/health",
45 | "port": 8000,
46 | "schema": "Http"
47 | },
48 | "api_type": {
49 | "openai": {
50 | "name": "meta-llama/Llama-Guard-3-8B-INT8",
51 | "max_tokens": 1000,
52 | "temperature": 0
53 | }
54 | },
55 | "keepalive": "Blob"
56 | }
57 | }
--------------------------------------------------------------------------------
/config/llava-1.5-7b-hf.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "llava-hf",
5 | "name": "llava-1.5-7b-hf",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "entrypoint": [
10 | "/usr/bin/python3"
11 | ],
12 | "commands": [
13 | "/usr/lib/run_llava.py"
14 | ],
15 | "resources": {
16 | "CPU": 20000,
17 | "Mem": 12000,
18 | "GPU": {
19 | "Type": "Any",
20 | "Count": 1,
21 | "vRam": 14000
22 | }
23 | },
24 | "envs": [
25 | [
26 | "LD_LIBRARY_PATH",
27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 | ]
29 | ],
30 | "mounts": [
31 | {
32 | "hostpath": "/home/brad/cache",
33 | "mountpath": "/root/.cache/huggingface"
34 | }
35 | ],
36 | "endpoint": {
37 | "port": 8000,
38 | "schema": "Http",
39 | "probe": "/health"
40 | },
41 | "sample_query": {
42 | "apiType": "llava",
43 | "prompt": "What is shown in this image?",
44 | "path": "v1/completions",
45 | "body": {
46 | "image": "https://www.ilankelman.org/stopsigns/australia.jpg"
47 | }
48 | },
49 | "standby": {
50 | "gpu": "Blob",
51 | "pageable": "Blob",
52 | "pinned": "Blob"
53 | }
54 | }
55 | }
56 | }
--------------------------------------------------------------------------------
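Note: the "llava" apiType differs from the "openai" entries above: the body carries an image URL instead of model/sampling fields, and the request is served by the custom /usr/lib/run_llava.py entrypoint, whose exact request contract is not shown in this tree. The sketch below therefore only assembles the fields that sample_query itself provides, with the usual localhost:8000 assumption:

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",        # path and port from the config
    json={
        "prompt": "What is shown in this image?",  # sample_query.prompt
        "image": "https://www.ilankelman.org/stopsigns/australia.jpg",
    },
)
resp.raise_for_status()
print(resp.text)

--------------------------------------------------------------------------------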
/config/llava-hf_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "llava-hf",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/mamba-1.4b-hf.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "state-spaces",
5 | "name": "mamba-1.4b-hf",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "state-spaces/mamba-1.4b-hf",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "state-spaces/mamba-1.4b-hf",
49 | "max_tokens": "1000",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/config/mamba-2.8b-hf.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "state-spaces",
5 | "name": "mamba-2.8b-hf",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "commands": [
10 | "--model",
11 | "state-spaces/mamba-2.8b-hf",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 12000,
19 | "Mem": 50000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 13800
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "port": 8000,
40 | "schema": "Http",
41 | "probe": "/health"
42 | },
43 | "sample_query": {
44 | "apiType": "openai",
45 | "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI",
46 | "path": "v1/completions",
47 | "body": {
48 | "model": "state-spaces/mamba-2.8b-hf",
49 | "max_tokens": "1000",
50 | "temperature": "0",
51 | "stream": "true"
52 | }
53 | },
54 | "standby": {
55 | "gpu": "Blob",
56 | "pageable": "Blob",
57 | "pinned": "Blob"
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/config/meta-llama_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "meta-llama",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/microsoft_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "microsoft",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/mistral.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "mistral",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "mistralai/Mistral-7B-v0.1",
11 | "--enforce-eager"
12 | ],
13 | "resources": {
14 | "CPU": 100,
15 | "Mem": 200,
16 | "GPU": {
17 | "Type": "RTX3060",
18 | "Count": 2
19 | }
20 | },
21 | "envs": [
22 | [
23 | "LD_LIBRARY_PATH",
24 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
25 | ]
26 | ],
27 | "mounts": [
28 | {
29 | "hostpath": "/home/brad/cache",
30 | "mountpath": "/root/.cache/huggingface"
31 | }
32 | ],
33 | "endpoint": {
34 | "path": "/v1/completions",
35 | "port": 8000,
36 | "schema": "Http"
37 | },
38 | "probe": {
39 | "path": "/health",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "api_type": {
44 | "openai": {
45 | "name": "mistralai/Mistral-7B-v0.1",
46 | "max_tokens": 200,
47 | "temperature": 0
48 | }
49 | }
50 | }
51 | }
--------------------------------------------------------------------------------
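Note: the t1/ns1 entries such as this one describe the health check as a separate "probe" block (path, port, schema) rather than the inline "probe" string used by the newer specs. Either way it amounts to polling GET /health on the endpoint port; vLLM's OpenAI server answers 200 there once the engine is loaded. A minimal wait loop, host assumed:

import time
import requests

def wait_ready(base="http://localhost:8000", timeout=600):
    """Poll the probe path until it answers 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(f"{base}/health", timeout=5).status_code == 200:
                return True
        except requests.ConnectionError:
            pass  # server not accepting connections yet
        time.sleep(2)
    return False

print("ready" if wait_ready() else "timed out")

--------------------------------------------------------------------------------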
/config/mistralai_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "mistralai",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/mosaicml_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "mosaicml",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/mpt-7b-storywriter.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "mosaicml",
5 | "name": "mpt-7b-storywriter",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "mosaicml/mpt-7b-storywriter",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "1000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Here is a recipe for vegan banana bread:",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "mosaicml/mpt-7b-storywriter",
54 | "max_tokens": "800",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/mpt-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "mosaicml",
5 | "name": "mpt-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "mosaicml/mpt-7b",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "1000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "Here is a recipe for vegan banana bread:",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "mosaicml/mpt-7b",
54 | "max_tokens": "800",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/namespace1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "t1",
4 | "namespace": "system",
5 | "name": "ns1",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/nomic-ai_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "nomic-ai",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/ns1_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "t1",
4 | "namespace": "system",
5 | "name": "ns1",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/openai-community_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "openai-community",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/openbmb_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "openbmb",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/opt-iml-max-1.3b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "facebook",
5 | "name": "opt-iml-max-1.3b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "facebook/opt-iml-max-1.3b",
12 | "--max-model-len",
13 | "200"
14 | ],
15 | "resources": {
16 | "CPU": 12000,
17 | "Mem": 24000,
18 | "GPU": {
19 | "Type": "Any",
20 | "Count": 1,
21 | "vRam": 4500
22 | }
23 | },
24 | "envs": [
25 | [
26 | "LD_LIBRARY_PATH",
27 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 | ],
29 | [
30 | "VLLM_CUDART_SO_PATH",
31 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "openai",
47 | "prompt": "What is the capital of USA?",
48 | "path": "v1/completions",
49 | "body": {
50 | "model": "facebook/opt-iml-max-1.3b",
51 | "max_tokens": "100",
52 | "temperature": "0",
53 | "stream": "true"
54 | }
55 | },
56 | "standby": {
57 | "gpu": "Mem",
58 | "pageable": "File",
59 | "pinned": "Mem"
60 | }
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/config/persimmon-8b-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "persimmon-8b-base",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "adept/persimmon-8b-base",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 6000,
19 | "Mem": 18000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 15000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "path": "/v1/completions",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "probe": {
44 | "path": "/health",
45 | "port": 8000,
46 | "schema": "Http"
47 | },
48 | "api_type": {
49 | "openai": {
50 | "name": "adept/persimmon-8b-base",
51 | "max_tokens": 1000,
52 | "temperature": 0
53 | }
54 | },
55 | "keepalive": "Blob"
56 | }
57 | }
--------------------------------------------------------------------------------
/config/persimmon-8b-chat.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "persimmon-8b-chat",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "adept/persimmon-8b-chat",
11 | "--enforce-eager",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 6000,
19 | "Mem": 18000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 2,
23 | "vRam": 15000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "path": "/v1/completions",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "probe": {
44 | "path": "/health",
45 | "port": 8000,
46 | "schema": "Http"
47 | },
48 | "api_type": {
49 | "openai": {
50 | "name": "adept/persimmon-8b-chat",
51 | "max_tokens": 1000,
52 | "temperature": 0
53 | }
54 | },
55 | "keepalive": "Blob"
56 | }
57 | }
--------------------------------------------------------------------------------
/config/public.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "tenant",
3 | "tenant": "system",
4 | "namespace": "system",
5 | "name": "public",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/reader.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "reader-lm",
6 | "spec": {
7 | "image": "vllm/vllm-openai:v0.7.3",
8 | "commands": [
9 | "--model",
10 | "jinaai/reader-lm-1.5b",
11 | "--enforce-eager"
12 | ],
13 | "resources": {
14 | "CPU": 100,
15 | "Mem": 200,
16 | "GPU": {
17 | "Type": "RTX3060",
18 | "Count": 1
19 | }
20 | },
21 | "envs": [
22 | [
23 | "LD_LIBRARY_PATH",
24 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
25 | ]
26 | ],
27 | "mounts": [
28 | {
29 | "hostpath": "/home/brad/cache",
30 | "mountpath": "/root/.cache/huggingface"
31 | }
32 | ],
33 | "endpoint": {
34 | "path": "/v1/completions",
35 | "port": 8000,
36 | "schema": "Http"
37 | },
38 | "probe": {
39 | "path": "/health",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "api_type": {
44 | "openai": {
45 | "name": "jinaai/reader-lm-1.5b",
46 | "max_tokens": 200,
47 | "temperature": 0
48 | }
49 | }
50 | }
51 | }
--------------------------------------------------------------------------------
/config/stabilityai_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "stabilityai",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/stable-diffusion-xl-base-1.0.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "stabilityai",
5 | "name": "stable-diffusion-xl-base-1.0",
6 | "object": {
7 | "spec": {
8 | "image": "vllm-openai-upgraded:v0.1.0",
9 | "entrypoint": [
10 | "/usr/bin/python3"
11 | ],
12 | "commands": [
13 | "/usr/lib/run_stablediffusion.py"
14 | ],
15 | "resources": {
16 | "CPU": 20000,
17 | "Mem": 50000,
18 | "GPU": {
19 | "Type": "Any",
20 | "Count": 1,
21 | "vRam": 13800
22 | }
23 | },
24 | "envs": [
25 | [
26 | "height",
27 | "512"
28 | ],
29 | [
30 | "width",
31 | "512"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "port": 8000,
42 | "schema": "Http",
43 | "probe": "/health"
44 | },
45 | "sample_query": {
46 | "apiType": "text2img",
47 | "prompt": "An astronaut riding a green horse",
48 | "path": "funccall",
49 | "body": {}
50 | },
51 | "standby": {
52 | "gpu": "Blob",
53 | "pageable": "Blob",
54 | "pinned": "Blob"
55 | }
56 | }
57 | }
58 | }
--------------------------------------------------------------------------------
/config/stablelm-3b-4e1t.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "stablelm-3b-4e1t",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "stabilityai/stablelm-3b-4e1t",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--max-model-len",
15 | "2000"
16 | ],
17 | "resources": {
18 | "CPU": 6000,
19 | "Mem": 18000,
20 | "GPU": {
21 | "Type": "Any",
22 | "Count": 1,
23 | "vRam": 8000
24 | }
25 | },
26 | "envs": [
27 | [
28 | "LD_LIBRARY_PATH",
29 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 | ]
31 | ],
32 | "mounts": [
33 | {
34 | "hostpath": "/home/brad/cache",
35 | "mountpath": "/root/.cache/huggingface"
36 | }
37 | ],
38 | "endpoint": {
39 | "path": "/v1/completions",
40 | "port": 8000,
41 | "schema": "Http"
42 | },
43 | "probe": {
44 | "path": "/health",
45 | "port": 8000,
46 | "schema": "Http"
47 | },
48 | "api_type": {
49 | "openai": {
50 | "name": "stabilityai/stablelm-3b-4e1t",
51 | "max_tokens": 1000,
52 | "temperature": 0
53 | }
54 | },
55 | "keepalive": "Blob"
56 | }
57 | }
58 | }
--------------------------------------------------------------------------------
/config/stablelm-tuned-alpha-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "t1",
4 | "namespace": "ns1",
5 | "name": "stablelm-tuned-alpha-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "stabilityai/stablelm-tuned-alpha-7b",
12 | "--enforce-eager",
13 | "--disable-custom-all-reduce",
14 | "--trust-remote-code",
15 | "--max-model-len",
16 | "2000",
17 | "--tensor-parallel-size=2"
18 | ],
19 | "resources": {
20 | "CPU": 6000,
21 | "Mem": 50000,
22 | "GPU": {
23 | "Type": "Any",
24 | "Count": 2,
25 | "vRam": 15000
26 | }
27 | },
28 | "envs": [
29 | [
30 | "LD_LIBRARY_PATH",
31 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 | ]
33 | ],
34 | "mounts": [
35 | {
36 | "hostpath": "/home/brad/cache",
37 | "mountpath": "/root/.cache/huggingface"
38 | }
39 | ],
40 | "endpoint": {
41 | "path": "/v1/completions",
42 | "port": 8000,
43 | "schema": "Http"
44 | },
45 | "probe": {
46 | "path": "/health",
47 | "port": 8000,
48 | "schema": "Http"
49 | },
50 | "api_type": {
51 | "openai": {
52 | "name": "stabilityai/stablelm-tuned-alpha-7b",
53 | "max_tokens": 1000,
54 | "temperature": 0
55 | }
56 | },
57 | "keepalive": "Blob"
58 | }
59 | }
60 | }
--------------------------------------------------------------------------------
/config/starcoder2-3b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "bigcode",
5 | "name": "starcoder2-3b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "bigcode/starcoder2-3b",
12 | "--disable-custom-all-reduce",
13 | "--max-model-len",
14 | "2000"
15 | ],
16 | "resources": {
17 | "CPU": 12000,
18 | "Mem": 50000,
19 | "GPU": {
20 | "Type": "Any",
21 | "Count": 1,
22 | "vRam": 13800
23 | }
24 | },
25 | "envs": [
26 | [
27 | "LD_LIBRARY_PATH",
28 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 | ],
30 | [
31 | "VLLM_CUDART_SO_PATH",
32 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 | ]
34 | ],
35 | "mounts": [
36 | {
37 | "hostpath": "/home/brad/cache",
38 | "mountpath": "/root/.cache/huggingface"
39 | }
40 | ],
41 | "endpoint": {
42 | "port": 8000,
43 | "schema": "Http",
44 | "probe": "/health"
45 | },
46 | "sample_query": {
47 | "apiType": "openai",
48 | "prompt": "def print_hello_world():",
49 | "path": "v1/completions",
50 | "body": {
51 | "model": "bigcode/starcoder2-3b",
52 | "max_tokens": "1000",
53 | "temperature": "0",
54 | "stream": "true"
55 | }
56 | },
57 | "standby": {
58 | "gpu": "Blob",
59 | "pageable": "Blob",
60 | "pinned": "Blob"
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/config/starcoder2-7b.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "function",
3 | "tenant": "public",
4 | "namespace": "bigcode",
5 | "name": "starcoder2-7b",
6 | "object": {
7 | "spec": {
8 | "image": "vllm/vllm-openai:v0.7.3",
9 | "commands": [
10 | "--model",
11 | "bigcode/starcoder2-7b",
12 | "--disable-custom-all-reduce",
13 | "--trust-remote-code",
14 | "--max-model-len",
15 | "2000",
16 | "--tensor-parallel-size=2"
17 | ],
18 | "resources": {
19 | "CPU": 20000,
20 | "Mem": 50000,
21 | "GPU": {
22 | "Type": "Any",
23 | "Count": 2,
24 | "vRam": 13800
25 | }
26 | },
27 | "envs": [
28 | [
29 | "LD_LIBRARY_PATH",
30 | "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 | ],
32 | [
33 | "VLLM_CUDART_SO_PATH",
34 | "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 | ]
36 | ],
37 | "mounts": [
38 | {
39 | "hostpath": "/home/brad/cache",
40 | "mountpath": "/root/.cache/huggingface"
41 | }
42 | ],
43 | "endpoint": {
44 | "port": 8000,
45 | "schema": "Http",
46 | "probe": "/health"
47 | },
48 | "sample_query": {
49 | "apiType": "openai",
50 | "prompt": "def print_hello_world():",
51 | "path": "v1/completions",
52 | "body": {
53 | "model": "bigcode/starcoder2-7b",
54 | "max_tokens": "1000",
55 | "temperature": "0",
56 | "stream": "true"
57 | }
58 | },
59 | "standby": {
60 | "gpu": "Blob",
61 | "pageable": "Blob",
62 | "pinned": "Blob"
63 | }
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/config/state-spaces_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "state-spaces",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/tenant1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "tenant",
3 | "tenant": "system",
4 | "namespace": "system",
5 | "name": "t1",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/config/tiiuae_namespace.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "namespace",
3 | "tenant": "public",
4 | "namespace": "system",
5 | "name": "tiiuae",
6 | "object": {
7 | "spec": {},
8 | "status": {
9 | "disable": false
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/dashboard/Makefile:
--------------------------------------------------------------------------------
1 | # pip install grpcio grpcio-tools
2 | # pip install psycopg2-binary
3 | all: protoc
4 | run:
5 | KEYCLOAK_URL=http://192.168.0.22:1260/authn python3 ./app.py
6 |
7 | protoc:
8 | python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. qobjs.proto
9 | python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. na.proto
10 |
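11 | # Typical flow (a sketch of intended usage, inferred from the targets above):
12 | #   make protoc   # regenerate the gRPC stubs from ../qshare/proto
13 | #   make run      # start the dashboard against the local Keycloak instance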
--------------------------------------------------------------------------------
/dashboard/__pycache__/na_pb2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2.cpython-38.pyc
--------------------------------------------------------------------------------
/dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc
--------------------------------------------------------------------------------
/dashboard/__pycache__/qobjs_pb2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2.cpython-38.pyc
--------------------------------------------------------------------------------
/dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc
--------------------------------------------------------------------------------
/dashboard/doc:
--------------------------------------------------------------------------------
1 | ../doc
--------------------------------------------------------------------------------
/dashboard/gunicorn.conf.py:
--------------------------------------------------------------------------------
1 | bind = "0.0.0.0:1250"
2 | workers = 4
3 | worker_class = "gevent"
4 | timeout = 30
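5 |
6 | # gunicorn reads ./gunicorn.conf.py by default, so a plain `gunicorn app:app`
7 | # (module name matching the Dockerfile's CMD) would pick up the bind address,
8 | # worker count, and timeout above.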
--------------------------------------------------------------------------------
/dashboard/requirements.txt:
--------------------------------------------------------------------------------
1 | aiofiles==23.1.0
2 | async-generator==1.10
3 | blinker==1.6.2
4 | exceptiongroup==1.1.1
5 | Flask==2.3.2
6 | grpcio==1.54.2
7 | grpcio-tools==1.54.2
8 | h2==3.2.0
9 | hpack==3.0.0
10 | html5tagger==1.3.0
11 | httptools==0.5.0
12 | hyperframe==5.2.0
13 | itsdangerous==2.1.2
14 | janus==1.0.0
15 | Jinja2==3.1.2
16 | MarkupSafe==2.1.3
17 | multidict==6.0.4
18 | numpy==1.24.3
19 | protobuf==4.23.2
20 | purerpc==0.8.0
21 | sanic==23.3.0
22 | sanic-routing==22.8.0
23 | sniffio==1.3.0
24 | tracerite==1.1.0
25 | typing_extensions==4.6.3
26 | ujson==5.7.0
27 | uvloop==0.17.0
28 | websockets==11.0.3
29 | Werkzeug==2.3.6
30 | flask-cors==4.0.1
31 | requests==2.25.1
32 | markdown==3.7
33 | gunicorn==23.0.0
34 | gevent==24.2.1
35 | Authlib==1.3.2
36 |
--------------------------------------------------------------------------------
/dashboard/sql/kv.sql:
--------------------------------------------------------------------------------
1 | DROP DATABASE IF EXISTS auditdb;
2 | CREATE DATABASE auditdb;
3 |
4 | \c auditdb;
5 |
6 | DROP TABLE IF EXISTS Pod;
7 | CREATE TABLE Pod (
8 | tenant VARCHAR NOT NULL,
9 | namespace VARCHAR NOT NULL,
10 | fpname VARCHAR NOT NULL,
11 | fprevision bigint,
12 | id VARCHAR NOT NULL,
13 | nodename VARCHAR NOT NULL,
14 | state VARCHAR NOT NULL,
15 | updatetime TIMESTAMP,
16 | PRIMARY KEY(tenant, namespace, fpname, fprevision, id)
17 | );
18 |
19 | DROP TABLE IF EXISTS PodAudit;
20 | CREATE TABLE PodAudit (
21 | tenant VARCHAR NOT NULL,
22 | namespace VARCHAR NOT NULL,
23 | fpname VARCHAR NOT NULL,
24 | fprevision bigint,
25 | id VARCHAR NOT NULL,
26 | nodename VARCHAR NOT NULL,
27 | action VARCHAR NOT NULL,
28 | state VARCHAR NOT NULL,
29 | updatetime TIMESTAMP,
30 | PRIMARY KEY(tenant, namespace, fpname, fprevision, id, updatetime)
31 | );
32 |
33 | DROP TABLE IF EXISTS ReqAudit;
34 | CREATE TABLE ReqAudit (
35 | seqid SERIAL PRIMARY KEY,
36 | podkey VARCHAR NOT NULL,
37 | audittime TIMESTAMP,
38 | keepalive bool,
39 | ttft int, -- Time to First Token
40 | latency int
41 | );
42 |
43 | CREATE USER audit_user WITH PASSWORD '123456';
44 | GRANT ALL ON ALL TABLES IN SCHEMA public to audit_user;
45 | GRANT USAGE ON SEQUENCE reqaudit_seqid_seq TO audit_user;
46 |
47 | -- https://stackoverflow.com/questions/18664074/getting-error-peer-authentication-failed-for-user-postgres-when-trying-to-ge
48 |
49 | DROP DATABASE IF EXISTS testdb;
50 | CREATE DATABASE testdb;
51 |
52 | \c testdb;
53 |
54 | DROP TABLE IF EXISTS Pod;
55 | CREATE TABLE Pod (
56 | tenant VARCHAR NOT NULL
57 | );
58 |
59 | insert into pod values ('asdf');
60 |
61 | CREATE OR REPLACE FUNCTION notification_trigger() RETURNS TRIGGER AS
62 | $$
63 | BEGIN
64 | PERFORM pg_notify('your_channel_name',
65 | to_json(NEW)::TEXT
66 | );
67 | RETURN NEW;
68 | END;
69 | $$ LANGUAGE plpgsql;
70 |
71 | CREATE OR REPLACE TRIGGER capture_change_trigger AFTER INSERT OR UPDATE OR DELETE ON pod
72 | FOR EACH ROW EXECUTE FUNCTION notification_trigger();
73 |
74 |
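75 | -- Example (hypothetical, not part of the schema setup): average time-to-first-token
76 | -- and latency per pod over the last hour, using the ReqAudit table above.
77 | -- SELECT podkey, AVG(ttft) AS avg_ttft, AVG(latency) AS avg_latency
78 | -- FROM ReqAudit
79 | -- WHERE audittime > NOW() - INTERVAL '1 hour'
80 | -- GROUP BY podkey;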
--------------------------------------------------------------------------------
/dashboard/sql/secret.sql:
--------------------------------------------------------------------------------
1 | --DROP TABLE ApiKey;
2 | CREATE TABLE Apikey (
3 | apikey VARCHAR NOT NULL,
4 | username VARCHAR NOT NULL,
5 | keyname VARCHAR NOT NULL,
6 | createtime TIMESTAMP,
7 | PRIMARY KEY(apikey)
8 | );
9 |
10 | CREATE UNIQUE INDEX apikey_idx_realm_username ON Apikey (username, keyname);
11 |
12 | CREATE TABLE UserRole (
13 | username VARCHAR NOT NULL,
14 | rolename VARCHAR NOT NULL,
15 | PRIMARY KEY(username, rolename)
16 | );
17 |
18 | CREATE INDEX userrole_idx_rolename ON UserRole (rolename);
19 |
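20 | -- Example (hypothetical): resolve the roles of the user that owns an API key,
21 | -- joining the two tables above.
22 | -- SELECT r.rolename
23 | -- FROM Apikey a JOIN UserRole r ON r.username = a.username
24 | -- WHERE a.apikey = '<key>';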
--------------------------------------------------------------------------------
/dashboard/static/button.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/static/button.gif
--------------------------------------------------------------------------------
/dashboard/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 | func audit
6 |
7 |
14 |
15 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/log.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 | func log
5 |
6 |
11 |
12 |
13 | namespace |
14 | func name |
15 | func id |
16 |
17 |
18 | {{ namespace }} |
19 | {{ funcName }} |
20 | {{ funcId }} |
21 |
22 |
23 |
24 | {{ log | safe }}
25 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/markdown.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 |
6 |
13 |
14 |
15 | {{ md_content|safe }} |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | {{ log | safe }}
24 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/node.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 | Node
5 |
6 |
13 |
14 |
15 | Node Name |
16 | Node |
17 |
18 |
19 | {{ name }} |
20 | {% autoescape false %}
21 | {{ node }} |
22 | {% endautoescape %}
23 |
24 |
25 |
26 | {{ log | safe }}
27 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/node_list.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 | Nodes
5 |
12 |
13 |
14 | nodename |
15 | Ip Address |
16 | CIDR |
17 | CPU Count |
18 | CPU Memory (GB) |
19 | MaxContextPerGPU |
20 | BlobStore |
21 | GPUs |
22 |
23 | {% for node in nodes %}
24 |
25 | {{
26 | node["name"] }} |
27 | {% autoescape false %}
28 | {{ node['object']['nodeIp'] }} |
29 | {{ node['object']['cidr'] }} |
30 | {{ node['object']['resources']['CPU'] // 1000 }} |
31 | {{ node['object']['resources']['Mem'] // 1000 }} |
32 | {{ node['object']['resources']['MaxContextPerGPU'] }} |
33 | {{ node['object']['blobStoreEnable'] }} |
34 | {{ node['object']['resources']['GPUs'] }} |
35 | {% endautoescape %}
36 |
37 | {% endfor %}
38 |
39 | {{ hosturl }}
40 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/pod.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
5 | funcpod
6 |
7 |
14 |
15 |
16 | tenant |
17 | namespace |
18 | podname |
19 |
20 |
21 | {{ tenant }} |
22 | {{ namespace }} |
23 | {{ podname }} |
24 |
25 |
26 |
27 | {% if audits %}
28 | state
29 |
30 |
31 | state |
32 | time |
33 |
34 | {% for audit in audits %}
35 |
36 | {{ audit["state"] }} |
37 | {{ audit["updatetime"] }} |
38 |
39 | {% endfor %}
40 |
41 | {% endif %}
42 | log
43 |
44 |
45 | {% autoescape false %}
46 | {{ log }} |
47 | {% endautoescape %}
48 |
49 |
50 |
51 |
59 | {{ hosturl }}
60 | {% endblock %}
--------------------------------------------------------------------------------
/dashboard/templates/snapshot_list.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 |
11 |
12 | Snapshots
13 |
14 |
15 | Snapshot Id |
16 | nodename |
17 | state |
18 | gpu |
19 | pageable |
20 | pinned |
21 | docker image name |
22 | build id |
23 |
24 | {% for snapshot in snapshots %}
25 |
26 | {{ snapshot["name"] }} |
27 | {{ snapshot["object"]['nodename'] }} |
28 | {{ snapshot["object"]['state'] }} |
29 | {{ snapshot["object"]['info']['gpuMemSizes'] }} |
30 | {{ snapshot["object"]['info']['processCheckpointSize'] // (1024*1024) }} MB |
31 | {{ snapshot["object"]['info']['hostMemSize'] // (1024*1024) }} MB |
32 | {{ snapshot["object"]['meta']['imagename'] }} |
33 | {{ snapshot["object"]['meta']['buildId'] }} |
34 |
35 | {% endfor %}
36 |
37 | {{ hosturl }}
38 | {% endblock %}
--------------------------------------------------------------------------------
/deployment/dashboard.Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | FROM python:3.10-slim-buster
4 |
5 | WORKDIR /
6 |
7 | RUN apt-get -y update
8 | RUN apt-get install -y libpq-dev gcc
9 | RUN apt-get install -y bash
10 | RUN apt-get install -y nginx
11 | RUN apt-get install -y curl
12 |
13 | COPY requirements.txt requirements.txt
14 | RUN pip3 install -r requirements.txt
15 |
16 | COPY . .
17 |
18 | COPY nginx.conf /etc/nginx/sites-available/default
19 |
20 | CMD service nginx start && gunicorn -w 4 -b 0.0.0.0:1250 app:app
21 | # CMD service nginx start && python3 ./app.py
22 | # CMD python3 ./app.py
--------------------------------------------------------------------------------
/deployment/llava.Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
4 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04
5 | WORKDIR /
6 | RUN apt-get -y update
7 | RUN apt-get -y install libnuma-dev fuse3 libkeyutils-dev libaio-dev
8 |
9 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml
10 | COPY node.json /opt/inferx/config/node.json
11 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so
12 | COPY . .
13 | CMD ["./onenode"]
--------------------------------------------------------------------------------
/deployment/one.Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04
4 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
5 | WORKDIR /
6 | RUN apt-get -y update
7 | RUN apt-get -y install libnuma-dev
8 | RUN apt-get -y install fuse3
9 | RUN apt-get -y install libkeyutils-dev
10 | RUN apt-get -y install libaio-dev
11 | # RUN apt-get -y install libssl3
12 | RUN apt-get -y install libssl-dev
13 |
14 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml
15 | COPY node.json /opt/inferx/config/node.json
16 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so
17 | COPY . .
18 | CMD ["./onenode"]
--------------------------------------------------------------------------------
/deployment/spdk.Dockerfile:
--------------------------------------------------------------------------------
1 | # Use Ubuntu as the base image
2 | FROM ubuntu:22.04
3 |
4 | # Set environment variables
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | # Install dependencies
8 | RUN apt-get update && apt-get install -y \
9 | build-essential \
10 | git \
11 | gcc \
12 | make \
13 | libaio-dev \
14 | libpciaccess-dev \
15 | python3 \
16 | python3-pip \
17 | pciutils \
18 | pkg-config kmod \
19 | libjson-c-dev libcunit1-dev libssl-dev libcmocka-dev uuid-dev libiscsi-dev libkeyutils-dev libncurses5-dev libncursesw5-dev unzip libfuse3-dev patchelf \
20 | python3-configshell-fb python3-pexpect nasm libnuma-dev \
21 | autoconf automake libtool help2man systemtap-sdt-dev \
22 | astyle lcov clang sg3-utils shellcheck abigail-tools bash-completion ruby-dev pycodestyle bundler rake python3-paramiko curl \
23 | libpmem-dev libpmemblk-dev libpmemobj-dev \
24 | librados-dev librbd-dev libibverbs-dev librdmacm-dev
25 |
26 | # Clone the SPDK repository
27 | RUN git clone https://github.com/spdk/spdk.git /spdk --recursive
28 |
29 | # Set working directory
30 | WORKDIR /spdk
31 |
32 | RUN ./scripts/pkgdep.sh --all
33 | RUN ./configure
34 | RUN make
35 |
36 | # Set up entrypoint to provide SPDK CLI tools
37 | ENTRYPOINT scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json && scripts/setup.sh
38 |
--------------------------------------------------------------------------------
/deployment/spdk.script:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e # exit immediately if any command fails
3 |
4 | echo "Generating NVMe config..."
5 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json
6 | sleep 1
7 | echo "Running SPDK setup..."
8 | scripts/setup.sh
9 | sleep 1
10 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json
11 | sleep 1
12 | scripts/setup.sh
13 | echo "SPDK setup complete."
14 |
15 | while true; do sleep 86400; done
16 |
--------------------------------------------------------------------------------
/deployment/spdk2.Dockerfile:
--------------------------------------------------------------------------------
1 | # Use Ubuntu as the base image
2 | FROM inferx/spdk-container:v0.1.0
3 |
4 | # Set environment variables
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | COPY entrypoint.sh /spdk/entrypoint.sh
8 |
9 | # Set working directory
10 | WORKDIR /spdk
11 |
12 |
13 | # Set up entrypoint to provide SPDK CLI tools
14 | ENTRYPOINT bash /spdk/entrypoint.sh
15 |
--------------------------------------------------------------------------------
/deployment/vllm-opai.Dockerfile:
--------------------------------------------------------------------------------
1 | # docker build -t vllm-openai-upgraded .
2 | FROM vllm/vllm-openai:v0.7.3
3 | WORKDIR /
4 | # Upgrade the transformers library
5 | RUN apt-get -y update
6 | RUN apt-get install libglib2.0-0 -y
7 | RUN apt-get install libgl1 -y
8 |
9 | RUN pip install --upgrade transformers
10 | RUN pip install --upgrade safetensors
11 | RUN pip install diffusers --upgrade
12 | RUN pip install invisible_watermark accelerate
13 |
14 | COPY run_model.py /usr/lib/run_model.py
15 | COPY run_llava.py /usr/lib/run_llava.py
16 | COPY run_stablediffusion.py /usr/lib/run_stablediffusion.py
17 |
18 |
--------------------------------------------------------------------------------
/doc/GPUSnapshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/GPUSnapshot.png
--------------------------------------------------------------------------------
/doc/architect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/architect.png
--------------------------------------------------------------------------------
/doc/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/comparison.png
--------------------------------------------------------------------------------
/doc/daemon.json:
--------------------------------------------------------------------------------
1 | {
2 | "runtimes": {
3 | "nvidia": {
4 | "args": [],
5 | "path": "nvidia-container-runtime"
6 | },
7 | "inferx": {
8 | "path": "/opt/inferx/bin/inferx"
9 | }
10 | }
11 | }
--------------------------------------------------------------------------------
/doc/home.md:
--------------------------------------------------------------------------------
1 | ../README.md
--------------------------------------------------------------------------------
/doc/infer_Profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/infer_Profile.png
--------------------------------------------------------------------------------
/doc/keycloak.md:
--------------------------------------------------------------------------------
1 | 1. Create Realm "Inferx"
2 | 2. Create Client "infer_client" in Realm "Inferx"
3 | a. Enable Client authentication
4 | b. Add Valid redirect URI
5 | https://inferx.net:8000/*
6 | http://:1250/*
7 | http://:81/*
8 | http://:4000/*
9 | c. Add web origins
10 | https://inferx.net:8000
11 | http://:1250
12 | http://:81
13 | http://:4000
14 | d. Enable "Direct Access Grants Enabled"
15 | 3. Update KEYCLOAK_CLIENT_SECRET in docker-compose_blob.yml
16 | 4. Update the KEYCLOAK_URL with the local address
17 |
18 |
19 | curl -X POST "http://192.168.0.22:1260/authn/realms/inferx/protocol/openid-connect/token" \
20 | -H "Content-Type: application/x-www-form-urlencoded" \
21 | -d "client_id=infer_client" \
22 | -d "client_secret=M2Dse5531tdtyipZdGizLEeoOVgziQRX" \
23 | -d "username=testuser1" \
24 | -d "password=test" \
25 | -d "grant_type=password"
26 |
27 |
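28 | The response contains an access_token, which can then be sent as a Bearer
29 | token on subsequent requests. A sketch (hypothetical target URL; adjust to
30 | the deployment):
31 |
32 | curl http://192.168.0.22:1250/ -H "Authorization: Bearer $ACCESS_TOKEN"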
--------------------------------------------------------------------------------
/doc/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo.png
--------------------------------------------------------------------------------
/doc/logo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo1.png
--------------------------------------------------------------------------------
/doc/logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo2.png
--------------------------------------------------------------------------------
/inferxlib/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "inferxlib"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 |
8 | [dependencies]
9 | serde = { version = "1.0", features = ["derive"] }
10 | serde_json = "1.0"
11 | regex = "1.7.1"
12 | bollard = "=0.17.0"
13 | reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
14 | log = "0.4.17"
15 | log4rs = "1"
16 |
17 | [dependencies.lazy_static]
18 | version = "1.0"
19 | features = ["spin_no_std"]
20 |
--------------------------------------------------------------------------------
/inferxlib/src/common.rs:
--------------------------------------------------------------------------------
1 | use serde_json::Error as SerdeJsonError;
2 |
3 | pub type Result<T> = core::result::Result<T, Error>;
4 |
5 | #[derive(Debug)]
6 | pub enum Error {
7 | CommonError(String),
8 | NotExist(String),
9 | Exist(String),
10 | SchedulerNoEnoughResource(String),
11 | SerdeJsonError(SerdeJsonError),
12 | StdIOErr(std::io::Error),
13 | ReqWestErr(reqwest::Error),
14 | }
15 |
16 | impl From<SerdeJsonError> for Error {
17 | fn from(item: SerdeJsonError) -> Self {
18 | return Self::SerdeJsonError(item);
19 | }
20 | }
21 |
22 | impl From<std::io::Error> for Error {
23 | fn from(item: std::io::Error) -> Self {
24 | return Self::StdIOErr(item);
25 | }
26 | }
27 |
28 | impl From<reqwest::Error> for Error {
29 | fn from(item: reqwest::Error) -> Self {
30 | return Self::ReqWestErr(item);
31 | }
32 | }
33 |
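34 | // With the From impls above, callers can use `?` to convert library errors
35 | // into Error automatically. A minimal sketch (hypothetical caller, not part
36 | // of the crate):
37 | //
38 | // fn parse(s: &str) -> Result<serde_json::Value> {
39 | //     let v: serde_json::Value = serde_json::from_str(s)?; // SerdeJsonError -> Error
40 | //     Ok(v)
41 | // }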
--------------------------------------------------------------------------------
/inferxlib/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(dead_code)]
2 | #![allow(non_snake_case)]
3 | #![allow(non_upper_case_globals)]
4 | #![allow(non_camel_case_types)]
5 | #![allow(deprecated)]
6 | #![allow(unused_imports)]
7 |
8 | #[macro_use]
9 | extern crate log;
10 |
11 | pub mod common;
12 | pub mod data_obj;
13 | pub mod node;
14 | pub mod obj_mgr;
15 | pub mod resource;
16 | pub mod selector;
17 | pub mod validation;
18 |
--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/cidrlock.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
3 | use crate::resource::NodeResources;
4 |
5 | use crate::data_obj::*;
6 |
7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)]
8 | pub struct CidrlockSpec {}
9 |
--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | pub mod cidrlock;
16 | pub mod func_mgr;
17 | pub mod funcsnapshot_mgr;
18 | pub mod namespace_mgr;
19 | pub mod node_mgr;
20 | pub mod pod_mgr;
21 | pub mod tenant_mgr;
22 |
--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/namespace_mgr.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use serde::{Deserialize, Serialize};
16 |
17 | use crate::data_obj::*;
18 |
19 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
20 | pub struct NamespaceObject {
21 | pub spec: NamespaceSpec,
22 | pub status: NamespaceStatus,
23 | }
24 |
25 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
26 | pub struct NamespaceStatus {
27 | pub disable: bool,
28 | }
29 |
30 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
31 | pub struct NamespaceSpec {}
32 |
33 | pub type Namespace = DataObject<NamespaceObject>;
34 | pub type NamespaceMgr = DataObjectMgr<NamespaceObject>;
35 |
36 | impl Namespace {
37 | pub const KEY: &'static str = "namespace";
38 | }
39 |
--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/node_mgr.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
3 | use crate::resource::NodeResources;
4 |
5 | use crate::data_obj::*;
6 |
7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)]
8 | pub struct NodeSpec {
9 | pub nodeIp: String,
10 | pub podMgrPort: u16,
11 | pub tsotSvcPort: u16,
12 | pub stateSvcPort: u16,
13 | pub cidr: String,
14 | pub resources: NodeResources,
15 | pub blobStoreEnable: bool,
16 | }
17 |
18 | pub type Node = DataObject<NodeSpec>;
19 | pub type NodeMgr = DataObjectMgr<NodeSpec>;
20 |
21 | impl Node {
22 | pub const KEY: &'static str = "node_info";
23 | pub const TENANT: &'static str = "system";
24 | pub const NAMESPACE: &'static str = "system";
25 |
26 | pub fn QletUrl(&self) -> String {
27 | return format!("http://{}:{}", self.object.nodeIp, self.object.podMgrPort);
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/tenant_mgr.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use serde::{Deserialize, Serialize};
16 |
17 | use crate::data_obj::*;
18 |
19 | pub const SYSTEM_TENANT: &str = "system";
20 | pub const SYSTEM_NAMESPACE: &str = "system";
21 |
22 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
23 | pub struct TenantObject {
24 | pub spec: TenantSpec,
25 | pub status: TenantStatus,
26 | }
27 |
28 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
29 | pub struct TenantStatus {
30 | pub disable: bool,
31 | }
32 |
33 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
34 | pub struct TenantSpec {}
35 |
36 | pub type Tenant = DataObject<TenantObject>;
37 |
38 | impl Tenant {
39 | pub const KEY: &'static str = "tenant";
40 | }
41 |
42 | pub type TenantMgr = DataObjectMgr<TenantObject>;
43 |
--------------------------------------------------------------------------------
/ixctl_logging_config.yaml:
--------------------------------------------------------------------------------
1 | appenders:
2 | my_stdout:
3 | kind: console
4 | encoder:
5 | pattern: "{h({d(%Y-%m-%d %H:%M:%S)(utc)} - {l}: {m}{n})}"
6 | my_file_logger:
7 | kind: rolling_file
8 | path: "/opt/inferx/log/ixctl.log"
9 | encoder:
10 | pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}"
11 | policy:
12 | trigger:
13 | kind: size
14 | limit: 50mb
15 | roller:
16 | kind: delete
17 | append_logger:
18 | kind: file
19 | path: "/opt/inferx/log/ixctl.log"
20 | append: true
21 | encoder:
22 | pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}"
23 | root:
24 | level: info
25 | appenders:
26 | - append_logger
27 |
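28 | # Note: my_stdout and my_file_logger are defined above but not referenced by
29 | # the root logger; as configured, only append_logger receives output.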
--------------------------------------------------------------------------------
/k8s/clean-k3sagent.sh:
--------------------------------------------------------------------------------
1 | # Stop K3s service if running
2 | sudo systemctl stop k3s-agent || true
3 |
4 | # Run the uninstall script if present
5 | sudo /usr/local/bin/k3s-agent-uninstall.sh || true
6 |
7 | # Clean residual data
8 | sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet /etc/systemd/system/k3s-agent.service /usr/local/bin/k3s*
9 |
10 | # Optionally clean containerd data if used before
11 | # sudo rm -rf /var/lib/containerd
12 |
13 | echo "K3s agent cleanup complete."
--------------------------------------------------------------------------------
/k8s/cleanup-k3s.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euxo pipefail
3 |
4 | ### 1. Stop & Uninstall K3s
5 | if command -v k3s-uninstall.sh &> /dev/null; then
6 | sudo /usr/local/bin/k3s-uninstall.sh # Stops k3s, removes services, data, etc.
7 | fi
8 | if command -v k3s-agent-uninstall.sh &> /dev/null; then
9 | sudo /usr/local/bin/k3s-agent-uninstall.sh # Removes agent components on workers
10 | fi
11 |
12 | ### 2. Kill any remaining processes
13 | if command -v k3s-killall.sh &> /dev/null; then
14 | sudo /usr/local/bin/k3s-killall.sh # Kills k3s-related processes, containerd, etc.
15 | fi
16 |
17 | ### 3. Remove leftover dirs and configs
18 | # sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet
19 | # /etc/containerd /var/lib/containerd # Clean containerd and K3s state
20 |
21 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/systemd/system/k3s* /var/lib/containerd /etc/cni /opt/cni
22 |
23 |
24 | ### 4. Restart containerd to clear any stuck state
25 | sudo systemctl restart containerd # Ensures containerd is fresh
26 |
27 | echo "✔️ K3s and related components have been fully removed."
28 |
--------------------------------------------------------------------------------
/k8s/dashboard.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: inferx-dashboard
5 | labels:
6 | app: inferx-dashboard
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: inferx-dashboard
12 | template:
13 | metadata:
14 | labels:
15 | app: inferx-dashboard
16 | spec:
17 | containers:
18 | - name: inferx-dashboard
19 | image: inferx/inferx_dashboard:v0.1.1
20 | imagePullPolicy: IfNotPresent
21 | env:
22 | - name: KEYCLOAK_URL
23 | value: "http://192.168.0.22:31260/authn"
24 | - name: KEYCLOAK_REALM_NAME
25 | value: "inferx"
26 | - name: KEYCLOAK_CLIENT_ID
27 | value: "infer_client"
28 | - name: KEYCLOAK_CLIENT_SECRET
29 | value: "M2Dse5531tdtyipZdGizLEeoOVgziQRX"
30 | - name: INFERX_APIGW_ADDR
31 | value: "http://nodeagent:4000"
32 | volumeMounts:
33 | - name: cert-volume
34 | mountPath: /etc/letsencrypt/
35 | livenessProbe:
36 | httpGet:
37 | path: /intro?name=home.md
38 | port: 1250
39 | initialDelaySeconds: 10
40 | periodSeconds: 10
41 | volumes:
42 | - name: cert-volume
43 | hostPath:
44 | path: /etc/letsencrypt/
45 | ---
46 | apiVersion: v1
47 | kind: Service
48 | metadata:
49 | name: inferx-dashboard
50 | spec:
51 | type: NodePort
52 | selector:
53 | app: inferx-dashboard
54 | ports:
55 | - name: http
56 | port: 1250
57 | targetPort: 1250
58 | nodePort: 31250
--------------------------------------------------------------------------------
/k8s/db-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: db-pvc
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 | name: db
16 | spec:
17 | replicas: 1
18 | selector:
19 | matchLabels:
20 | app: db
21 | template:
22 | metadata:
23 | labels:
24 | app: db
25 | spec:
26 | nodeSelector:
27 | inferx_storage: data
28 | containers:
29 | - name: postgres
30 | image: postgres:14.5
31 | imagePullPolicy: IfNotPresent
32 | env:
33 | - name: POSTGRES_USER
34 | value: audit_user
35 | - name: POSTGRES_PASSWORD
36 | value: "123456"
37 | - name: POSTGRES_DB
38 | value: auditdb
39 | - name: PGDATA
40 | value: /data/postgres
41 | volumeMounts:
42 | - name: db-data
43 | mountPath: /data/postgres
44 | - name: init-sql
45 | mountPath: /docker-entrypoint-initdb.d/db.sql
46 | volumes:
47 | - name: db-data
48 | hostPath:
49 | path: /opt/inferx/data/postgres
50 | type: DirectoryOrCreate
51 | - name: init-sql
52 | hostPath:
53 | path: /opt/inferx/config/create_table.sql
54 | type: File
55 | ---
56 | apiVersion: v1
57 | kind: Service
58 | metadata:
59 | name: db
60 | spec:
61 | selector:
62 | app: db
63 | ports:
64 | - port: 5432
65 | targetPort: 5432
66 | nodePort: 30542
67 | type: NodePort
68 |
--------------------------------------------------------------------------------
/k8s/etcd.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: etcd
5 | labels:
6 | app: etcd
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: etcd
12 | template:
13 | metadata:
14 | labels:
15 | app: etcd
16 | spec:
17 | nodeSelector:
18 | inferx_storage: data
19 | containers:
20 | - name: etcd
21 | image: quay.io/coreos/etcd:v3.5.13
22 | imagePullPolicy: IfNotPresent
23 | volumeMounts:
24 | - name: etcd-data
25 | mountPath: /opt/inferx/data/etcd
26 | command: [ "etcd" ]
27 | args:
28 | - "--name=etcd-00"
29 | - "--data-dir=/opt/inferx/data/etcd"
30 | - "--advertise-client-urls=http://etcd-00:2379"
31 | - "--listen-client-urls=http://0.0.0.0:2379"
32 | - "--initial-advertise-peer-urls=http://etcd-00:2380"
33 | - "--listen-peer-urls=http://0.0.0.0:2380"
34 | - "--initial-cluster=etcd-00=http://etcd-00:2380"
35 | volumes:
36 | - name: etcd-data
37 | hostPath:
38 | path: /opt/inferx/data/etcd
39 | type: DirectoryOrCreate
40 | ---
41 | apiVersion: v1
42 | kind: Service
43 | metadata:
44 | name: etcd
45 | spec:
46 | selector:
47 | app: etcd
48 | ports:
49 | - port: 2379
50 | targetPort: 2379
51 |
--------------------------------------------------------------------------------
/k8s/ingress.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: networking.k8s.io/v1
2 | kind: Ingress
3 | metadata:
4 | name: inferx-ingress
5 | annotations:
6 | nginx.ingress.kubernetes.io/use-regex: "true"
7 | nginx.ingress.kubernetes.io/rewrite-target: /$1$2
8 | nginx.ingress.kubernetes.io/proxy-buffering: "off"
9 | nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
10 | nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
11 | nginx.ingress.kubernetes.io/proxy-chunked: "on"
12 | spec:
13 | rules:
14 | - http:
15 | paths:
16 | - path: /funccall/
17 | pathType: Prefix
18 | backend:
19 | service:
20 | name: nodeagent
21 | port:
22 | number: 4000
23 | - path: /authn/
24 | pathType: Prefix
25 | backend:
26 | service:
27 | name: keycloak
28 | port:
29 | number: 8080
30 | - path: /
31 | pathType: Prefix
32 | backend:
33 | service:
34 | name: inferx-dashboard
35 | port:
36 | number: 1250
37 | ports:
38 | web:
39 | port: 80
40 | hostPort: 80
41 | expose: true
42 | websecure:
43 | port: 443
44 | hostPort: 443
45 | expose: true
--------------------------------------------------------------------------------
/k8s/install-k3s.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 |
5 | ### 2. Install K3s using Docker runtime
6 | echo "[+] Installing K3s with Docker as container runtime..."
7 | curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--docker --node-external-ip=192.168.0.22" sh -
8 |
9 | echo "[+] Waiting for K3s to be ready..."
10 | sleep 10
11 | kubectl get node
12 |
13 | ### 3. Install Helm (if not installed)
14 | if ! command -v helm &> /dev/null; then
15 | echo "[+] Installing Helm..."
16 | curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
17 | fi
18 |
19 | ### 4. Add NVIDIA Helm repo
20 | echo "[+] Adding NVIDIA Helm repo..."
21 | helm repo add nvidia https://nvidia.github.io/gpu-operator
22 | helm repo update
23 |
24 | ### 5. Deploy NVIDIA GPU Operator with Docker runtime
25 | echo "[+] Installing NVIDIA GPU Operator..."
26 | export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
27 | chmod 555 /etc/rancher/k3s/k3s.yaml
28 | helm install --wait gpu-operator \
29 | nvidia/gpu-operator \
30 | -n gpu-operator --create-namespace \
31 | --set operator.defaultRuntime=docker \
32 | --set driver.enabled=false \
33 | --set toolkit.enabled=true
34 |
35 | echo "[✓] K3s with Docker runtime and NVIDIA GPU Operator installed successfully."
36 |
--------------------------------------------------------------------------------
/k8s/join-k3sagent.sh:
--------------------------------------------------------------------------------
1 | # On server node
2 | # sudo cat /var/lib/rancher/k3s/server/node-token
3 | # hostname -I # Use internal IP accessible by the joining node
4 |
5 |
6 | sudo /usr/local/bin/k3s-agent-uninstall.sh
7 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /var/lib/cni /run/flannel
8 |
9 |
10 |
11 | curl -sfL https://get.k3s.io | K3S_URL=https://192.168.0.22:6443 \
12 | K3S_TOKEN=K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \
13 | INSTALL_K3S_EXEC="--docker --with-node-id" sh -
14 |
15 | sudo k3s agent --docker \
16 | --server https://192.168.0.22:6443 \
17 | --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \
18 | --with-node-id \
19 | --node-name inferx-agent1 \
20 | --debug
21 |
22 |
23 | # sudo k3s agent --docker --server https://192.168.0.22:6443 --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 --debug
24 |
25 |
--------------------------------------------------------------------------------
/k8s/keycloak.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: keycloak
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: keycloak
10 | template:
11 | metadata:
12 | labels:
13 | app: keycloak
14 | spec:
15 | containers:
16 | - name: keycloak
17 | image: quay.io/keycloak/keycloak:latest
18 | imagePullPolicy: IfNotPresent
19 | args: ["start-dev", "--verbose"]
20 | env:
21 | - name: KEYCLOAK_ADMIN
22 | value: admin
23 | - name: KEYCLOAK_ADMIN_PASSWORD
24 | value: admin
25 | - name: KC_DB
26 | value: postgres
27 | - name: KC_DB_URL
28 | value: jdbc:postgresql://keycloak-postgres:5432/keycloak
29 | - name: KC_DB_USERNAME
30 | value: keycloak
31 | - name: KC_DB_PASSWORD
32 | value: "123456"
33 | - name: KC_HTTP_ENABLED
34 | value: "true"
35 | - name: KC_PROXY
36 | value: edge
37 | - name: KC_HOSTNAME_STRICT_HTTPS
38 | value: "false"
39 | - name: KC_HOSTNAME_STRICT
40 | value: "false"
41 | - name: KC_HTTP_RELATIVE_PATH
42 | value: /authn
43 | ports:
44 | - containerPort: 8080
45 | ---
46 | apiVersion: v1
47 | kind: Service
48 | metadata:
49 | name: keycloak
50 | spec:
51 | type: NodePort
52 | selector:
53 | app: keycloak
54 | ports:
55 | - port: 8080
56 | targetPort: 8080
57 | nodePort: 31260 # Can customize between 30000–32767
58 |
--------------------------------------------------------------------------------
/k8s/keycloak_postgres.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: keycloak-db-pvc
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 | name: keycloak-postgres
16 | spec:
17 | replicas: 1
18 | selector:
19 | matchLabels:
20 | app: keycloak-postgres
21 | template:
22 | metadata:
23 | labels:
24 | app: keycloak-postgres
25 | spec:
26 | nodeSelector:
27 | inferx_storage: data
28 | containers:
29 | - name: postgres
30 | image: postgres:14.5
31 | imagePullPolicy: IfNotPresent
32 | env:
33 | - name: POSTGRES_USER
34 | value: keycloak
35 | - name: POSTGRES_PASSWORD
36 | value: "123456"
37 | - name: POSTGRES_DB
38 | value: keycloak
39 | - name: PGDATA
40 | value: /data/postgres
41 | ports:
42 | - containerPort: 5432
43 | volumeMounts:
44 | - name: db-data
45 | mountPath: /data/postgres
46 | volumes:
47 | - name: db-data
48 | hostPath:
49 | path: /opt/inferx/data/postgres_keycloak
50 | type: DirectoryOrCreate
51 | ---
52 | apiVersion: v1
53 | kind: Service
54 | metadata:
55 | name: keycloak-postgres
56 | spec:
57 | selector:
58 | app: keycloak-postgres
59 | ports:
60 | - port: 5432
61 | targetPort: 5432
62 |
--------------------------------------------------------------------------------
/k8s/nvidia-test.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: nvidia-test
5 | spec:
6 | containers:
7 | - name: cuda-container
8 | image: nvidia/cuda:12.2.0-devel-ubuntu20.04
9 | imagePullPolicy: IfNotPresent
10 | command: ["sleep", "infinity"]
11 | resources:
12 | limits:
13 | nvidia.com/gpu: 1
14 | nodeSelector:
15 | kubernetes.io/hostname: brad-ms-7d46
--------------------------------------------------------------------------------
/k8s/scheduler.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: scheduler
5 | labels:
6 | app: scheduler
7 | spec:
8 | replicas: 2
9 | selector:
10 | matchLabels:
11 | app: scheduler
12 | template:
13 | metadata:
14 | labels:
15 | app: scheduler
16 | spec:
17 | hostPID: true
18 | containers:
19 | - name: scheduler
20 | image: inferx/inferx_one:v0.1.1
21 | imagePullPolicy: IfNotPresent
22 | env:
23 | - name: POD_IP
24 | valueFrom:
25 | fieldRef:
26 | fieldPath: status.podIP
27 | - name: RUN_SERVICE
28 | value: "Scheduler"
29 | - name: STATESVC_ADDR
30 | value: "http://statesvc:1237"
31 | volumeMounts:
32 | - mountPath: /opt/inferx/
33 | name: opt-inferx
34 | command: ["./onenode", "/opt/inferx/config/node.json"]
35 | volumes:
36 | - name: opt-inferx
37 | hostPath:
38 | path: /opt/inferx/
39 | ---
40 | apiVersion: v1
41 | kind: Service
42 | metadata:
43 | name: scheduler
44 | spec:
45 | type: NodePort
46 | selector:
47 | app: scheduler
48 | ports:
49 | - name: http
50 | port: 1238
51 | targetPort: 1238
52 |
--------------------------------------------------------------------------------
/k8s/secretdb.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: secret-db-pvc
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 | name: secret-db
16 | spec:
17 | replicas: 1
18 | selector:
19 | matchLabels:
20 | app: secret-db
21 | template:
22 | metadata:
23 | labels:
24 | app: secret-db
25 | spec:
26 | nodeSelector:
27 | inferx_storage: data
28 | containers:
29 | - name: postgres
30 | image: postgres:14.5
31 | imagePullPolicy: IfNotPresent
32 | ports:
33 | - containerPort: 5432
34 | env:
35 | - name: POSTGRES_USER
36 | value: secret
37 | - name: POSTGRES_PASSWORD
38 | value: "123456"
39 | - name: POSTGRES_DB
40 | value: secretdb
41 | - name: PGDATA
42 | value: /data/postgres
43 | volumeMounts:
44 | - name: db-data
45 | mountPath: /data/postgres
46 | - name: init-sql
47 | mountPath: /docker-entrypoint-initdb.d/db.sql
48 | volumes:
49 | - name: db-data
50 | hostPath:
51 | path: /opt/inferx/data/postgres_secret
52 | type: DirectoryOrCreate
53 | - name: init-sql
54 | hostPath:
55 | path: /opt/inferx/config/secret.sql
56 | type: File
57 | ---
58 | apiVersion: v1
59 | kind: Service
60 | metadata:
61 | name: secret-db
62 | spec:
63 | selector:
64 | app: secret-db
65 | ports:
66 | - port: 5432
67 | targetPort: 5432
68 | nodePort: 30541
69 | type: NodePort
70 |
71 |
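
Because the Service is of type NodePort with nodePort 30541, the secret store
is also reachable from outside the cluster on any node's IP, which is what the
secretStoreAddr postgresql://secret:123456@localhost:30541/secretdb in
/nodeconfig/node4.json below relies on. A sketch of that external connection
(psycopg2 assumed installed; "localhost" assumes the client runs on a cluster
node):

    import psycopg2

    # Values are taken verbatim from the Deployment env and Service above.
    conn = psycopg2.connect(
        host="localhost",
        port=30541,
        dbname="secretdb",
        user="secret",
        password="123456",
    )
    with conn.cursor() as cur:
        cur.execute("SELECT 1;")
        assert cur.fetchone() == (1,)
    conn.close()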
--------------------------------------------------------------------------------
/k8s/spdk.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: spdk
5 | labels:
6 | app: spdk
7 | spec:
8 | selector:
9 | matchLabels:
10 | app: spdk
11 | template:
12 | metadata:
13 | labels:
14 | app: spdk
15 | spec:
16 | nodeSelector:
17 | inferx_nodeType: inferx_blob
18 | hostNetwork: true
19 | hostPID: true
20 | containers:
21 | - name: spdk
22 | image: inferx/spdk-container2:v0.1.0
23 | imagePullPolicy: IfNotPresent
24 | securityContext:
25 | privileged: true
26 | runAsUser: 0
27 | env:
28 | - name: HUGEMEM
29 | value: "64000"
30 | volumeMounts:
31 | - name: hugepages
32 | mountPath: /dev/hugepages
33 | - name: lib-modules
34 | mountPath: /lib/modules
35 | - name: opt-inferx
36 | mountPath: /opt/inferx
37 | - name: run-udev
38 | mountPath: /run/udev
39 | volumes:
40 | - name: hugepages
41 | hostPath:
42 | path: /dev/hugepages
43 | - name: lib-modules
44 | hostPath:
45 | path: /lib/modules
46 | - name: opt-inferx
47 | hostPath:
48 | path: /opt/inferx
49 | - name: run-udev
50 | hostPath:
51 | path: /run/udev
52 | restartPolicy: Always
53 | tolerations:
54 | - operator: "Exists" # Allow on tainted nodes
55 |
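
SPDK relies on preallocated hugepages; HUGEMEM=64000 follows SPDK's setup.sh
convention of megabytes. A quick host-side check that the node actually has a
hugepage pool, reading /proc/meminfo:

    # Parse hugepage counters from /proc/meminfo on the DaemonSet's node.
    info = {}
    with open("/proc/meminfo") as f:
        for line in f:
            key, _, rest = line.partition(":")
            info[key] = rest.split()[0]

    total = int(info["HugePages_Total"])
    size_kb = int(info["Hugepagesize"])  # usually 2048, i.e. 2 MiB pages
    print(f"{total} pages x {size_kb} kB = {total * size_kb // 1024} MB of hugepages")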
--------------------------------------------------------------------------------
/k8s/statesvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: statesvc
5 | labels:
6 | app: statesvc
7 | spec:
8 | replicas: 2
9 | selector:
10 | matchLabels:
11 | app: statesvc
12 | template:
13 | metadata:
14 | labels:
15 | app: statesvc
16 | spec:
17 | hostPID: true
18 | containers:
19 | - name: statesvc
20 | image: inferx/inferx_one:v0.1.1
21 | imagePullPolicy: IfNotPresent
22 | env:
23 | - name: POD_IP
24 | valueFrom:
25 | fieldRef:
26 | fieldPath: status.podIP
27 | - name: RUN_SERVICE
28 | value: "StateSvc"
29 | - name: CACHE_MEMORY
30 | value: 20Gi
31 | volumeMounts:
32 | - mountPath: /opt/inferx/
33 | name: opt-inferx
34 | command: ["./onenode", "/opt/inferx/config/node.json"]
35 | volumes:
36 | - name: opt-inferx
37 | hostPath:
38 | path: /opt/inferx/
39 | ---
40 | apiVersion: v1
41 | kind: Service
42 | metadata:
43 | name: statesvc
44 | spec:
45 | type: NodePort
46 | selector:
47 | app: statesvc
48 | ports:
49 | - name: http
50 | port: 1237
51 | targetPort: 1237
52 |
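
CACHE_MEMORY is written as a Kubernetes-style binary quantity ("20Gi"). If the
value ever needs to be interpreted outside the service, a small parser sketch
(the suffix table is an assumption, not the service's actual parser):

    # Convert a Kubernetes-style binary quantity such as "20Gi" to bytes.
    _SUFFIXES = {"Ki": 2**10, "Mi": 2**20, "Gi": 2**30, "Ti": 2**40}

    def parse_quantity(q: str) -> int:
        for suffix, factor in _SUFFIXES.items():
            if q.endswith(suffix):
                return int(q[: -len(suffix)]) * factor
        return int(q)  # plain byte count

    assert parse_quantity("20Gi") == 20 * 2**30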
--------------------------------------------------------------------------------
/nodeconfig/node.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node1",
3 | "etcdAddrs": [
4 | "http://etcd:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.3.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 | "runService": true,
21 | "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 400000,
25 | "GPUs": "Auto",
26 | "ContextOverhead": 450,
27 | "MaxContextPerGPU": 1
28 | },
29 | "snapshotDir": "/opt/inferx/snapshot",
30 | "enableBlobStore": false,
31 | "sharemem": {
32 | "size": 20,
33 | "hugepage": true
34 | },
35 | "tlsconfig": {
36 | "enable": false,
37 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
38 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
39 | },
40 | "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb",
41 | "keycloakconfig": {
42 | "url": "http://keycloak:8080/authn",
43 | "realm": "inferx"
44 | }
45 | }
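
This per-node config wires together seven local ports and three external
endpoints. A loader sketch performing the sanity checks the layout implies
(field names come from the file; the checks themselves are an assumption):

    import json
    from urllib.parse import urlparse

    with open("nodeconfig/node.json") as f:
        cfg = json.load(f)

    # The seven single-port fields defined above must not collide.
    port_fields = ["podMgrPort", "tsotCniPort", "tsotSvcPort", "qletStateSvcPort",
                   "statSvcPort", "schedulerPort", "gatewayPort"]
    ports = [cfg[k] for k in port_fields]
    assert len(ports) == len(set(ports)), "duplicate port assignment"

    # Every configured endpoint should parse as a URL with scheme and host.
    for url in cfg["etcdAddrs"] + cfg["stateSvcAddrs"] + [cfg["keycloakconfig"]["url"]]:
        parsed = urlparse(url)
        assert parsed.scheme and parsed.hostname, f"malformed endpoint: {url}"
    print("node.json looks consistent")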
--------------------------------------------------------------------------------
/nodeconfig/node1.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node1",
3 | "etcdAddrs": [
4 | "http://localhost:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.3.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 | "runService": true,
21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 400000,
25 | "GPUType": "A4000",
26 | "GPUs": "Auto",
27 | "ContextOverhead": 450,
28 | "MaxContextPerGPU": 1
29 | },
30 | "snapshotDir": "/opt/inferx/snapshot",
31 | "enableBlobStore": false,
32 | "sharemem": {
33 | "size": 20,
34 | "hugepage": true
35 | },
36 | "tlsconfig": {
37 | "enable": false,
38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 | },
41 | "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb",
42 | "keycloakconfig": {
43 | "url": "http://localhost:1260/authn",
44 | "realm": "inferx"
45 | }
46 | }
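
node1.json keeps the same topology as node.json but rewrites it for a single
host: etcd, the audit DB, the secret store (port 5431), and Keycloak (port
1260) all move to localhost, and GPUType is pinned to A4000. Differences like
these can be surfaced mechanically:

    import json

    def flatten(d, prefix=""):
        # Flatten nested dicts into dotted keys for easy comparison.
        out = {}
        for k, v in d.items():
            key = f"{prefix}{k}"
            if isinstance(v, dict):
                out.update(flatten(v, key + "."))
            else:
                out[key] = v
        return out

    with open("nodeconfig/node.json") as f1, open("nodeconfig/node1.json") as f2:
        a, b = flatten(json.load(f1)), flatten(json.load(f2))

    for key in sorted(a.keys() | b.keys()):
        if a.get(key) != b.get(key):
            print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")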
--------------------------------------------------------------------------------
/nodeconfig/node2.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node2",
3 | "etcdAddrs": [
4 | "http://localhost:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.2.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/var/run/quark/tsot-socket",
19 | "tsotGwSocketPath": "/var/run/quark_host/tsot-socket",
20 | "runService": false,
21 | "auditdbAddr": "postgresql://audit_user:123456@localhost/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 300000,
25 | "GPUType": "A4000",
26 | "GPUs": "Auto",
27 | "ContextOverhead": 450,
28 | "MaxContextPerGPU": 2
29 | },
30 | "snapshotDir": "/snapshot",
31 | "enableBlobStore": true
32 | }
--------------------------------------------------------------------------------
/nodeconfig/node3.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node2",
3 | "etcdAddrs": [
4 | "http://localhost:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.2.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 | "runService": true,
21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 400000,
25 | "GPUType": "A4000",
26 | "GPUs": "Auto",
27 | "ContextOverhead": 440,
28 | "MaxContextPerGPU": 1
29 | },
30 | "snapshotDir": "/opt/inferx/snapshot",
31 | "enableBlobStore": true,
32 | "sharemem": {
33 | "size": 36,
34 | "hugepage": true
35 | },
36 | "tlsconfig": {
37 | "enable": false,
38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 | },
41 | "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb",
42 | "keycloakconfig": {
43 | "url": "http://localhost:1260/authn",
44 | "realm": "inferx"
45 | }
46 | }
--------------------------------------------------------------------------------
/nodeconfig/node4.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node3",
3 | "etcdAddrs": [
4 | "http://localhost:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.2.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 | "runService": true,
21 | "auditdbAddr": "postgresql://audit_user:123456@localhost:30542/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 400000,
25 | "GPUType": "A4000",
26 | "GPUs": "Auto",
27 | "ContextOverhead": 450,
28 | "MaxContextPerGPU": 2
29 | },
30 | "snapshotDir": "/opt/inferx/snapshot",
31 | "enableBlobStore": true,
32 | "sharemem": {
33 | "size": 36,
34 | "hugepage": true
35 | },
36 | "tlsconfig": {
37 | "enable": false,
38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 | },
41 | "secretStoreAddr": "postgresql://secret:123456@localhost:30541/secretdb",
42 | "keycloakconfig": {
43 | "url": "http://localhost:31260",
44 | "realm": "inferx",
45 | "adminUser": "admin",
46 | "adminPassword": "admin"
47 | }
48 | }
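
node4.json targets the NodePort deployments (audit DB on 30542, secret store
on 30541, Keycloak on 31260) and, unlike the other variants, carries Keycloak
admin credentials. A sketch of exchanging them for an admin token via
Keycloak's stock admin-cli password grant (the endpoint path and the use of
the master realm for admin login are both assumptions about this deployment):

    import requests

    # url and credentials come from the keycloakconfig block above; the
    # /realms/master token path is the default for quarkus-based Keycloak.
    resp = requests.post(
        "http://localhost:31260/realms/master/protocol/openid-connect/token",
        data={
            "client_id": "admin-cli",
            "grant_type": "password",
            "username": "admin",
            "password": "admin",
        },
        timeout=10,
    )
    resp.raise_for_status()
    print(resp.json()["access_token"][:40], "...")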
--------------------------------------------------------------------------------
/nodeconfig/node_blob.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeName": "node1",
3 | "etcdAddrs": [
4 | "http://etcd:2379"
5 | ],
6 | "hostIpCidr": "192.168.0.0/16",
7 | "podMgrPort": 1233,
8 | "tsotCniPort": 1234,
9 | "tsotSvcPort": 1235,
10 | "qletStateSvcPort": 1236,
11 | "statSvcPort": 1237,
12 | "schedulerPort": 1238,
13 | "gatewayPort": 4000,
14 | "cidr": "10.1.3.0/8",
15 | "stateSvcAddrs": [
16 | "http://localhost:1237"
17 | ],
18 | "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 | "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 | "runService": true,
21 | "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb",
22 | "resources": {
23 | "CPU": 30000,
24 | "Mem": 400000,
25 | "GPUType": "A4000",
26 | "GPUs": "Auto",
27 | "ContextOverhead": 450,
28 | "MaxContextPerGPU": 1
29 | },
30 | "snapshotDir": "/opt/inferx/snapshot",
31 | "enableBlobStore": true,
32 | "sharemem": {
33 | "size": 50,
34 | "hugepage": true
35 | },
36 | "tlsconfig": {
37 | "enable": false,
38 | "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 | "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 | },
41 | "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb",
42 | "keycloakconfig": {
43 | "url": "http://keycloak:8080/authn",
44 | "realm": "inferx"
45 | }
46 | }
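
node_blob.json is the in-cluster variant with the blob store enabled and the
shared-memory region grown to 50. Assuming that size is in GB and that
HUGEMEM=64000 in /k8s/spdk.yaml is in MB (both unit readings are assumptions),
the region has to fit inside the hugepage pool the spdk DaemonSet provisions:

    # Back-of-the-envelope: does the sharemem region fit the hugepage pool?
    sharemem_gb = 50     # node_blob.json: sharemem.size (GB assumed)
    hugemem_mb = 64000   # spdk.yaml: HUGEMEM (MB assumed)
    assert sharemem_gb * 1000 <= hugemem_mb, "sharemem exceeds hugepage budget"
    print(f"headroom: {hugemem_mb - sharemem_gb * 1000} MB")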
--------------------------------------------------------------------------------
/script/inferx_clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean up leftover inferx sandboxes: for every sandbox directory under
4 | # PARENT_DIR, invoke the inferx runtime's runc-style `delete` subcommand
5 | # on the container named after that directory.
6 | PARENT_DIR="/opt/inferx/sandbox"
7 | INFERX_BIN="/opt/inferx/bin/inferx"
8 |
9 | # pkill -9 inferx
10 |
11 | for SUBDIR in "$PARENT_DIR"/*; do
12 |   if [ -d "$SUBDIR" ]; then
13 |     SUBFOLDER_NAME=$(basename "$SUBDIR")
14 |     echo "Deleting sandbox container: $SUBFOLDER_NAME"
15 |     "$INFERX_BIN" \
16 |       --root "/var/run/docker/runtime-runc/moby" \
17 |       --log-format json \
18 |       --systemd-cgroup delete "$SUBFOLDER_NAME"
19 |   fi
20 | done