├── .gitignore
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── config
    ├── Aquila-7B.json
    ├── BAAI_namespace.json
    ├── Baichuan-7B.json
    ├── Baichuan2-13B-Chat-4bits.json
    ├── Baichuan2-7B-Chat.json
    ├── DeciLM-7B-instruct.json
    ├── DeciLM-7B.json
    ├── Deci_namespace.json
    ├── DeepSeek-R1-Distill-Llama-8B.json
    ├── DeepSeek-R1-Distill-Qwen-1.5B.json
    ├── DeepSeek-R1-Distill-Qwen-7B.json
    ├── EXAONE-3.0-7.8B-Instruct copy.json
    ├── EXAONE-3.0-7.8B-Instruct.json
    ├── EleutherAI_namespace.json
    ├── Llama-2-13b-hf.json
    ├── Llama-3.2-3B-Instruct.json
    ├── Llama-3.2-3B-Instruct_2gpu.json
    ├── Meta-Llama-3-8B-Instruct.json
    ├── Meta-Llama-3-8B.json
    ├── MiniCPM-2B-dpo-bf16.json
    ├── MiniCPM-2B-sft-bf16.json
    ├── MiniCPM3-4B.json
    ├── Minitron-8B-Base.json
    ├── Mistral-7B-Instruct-v0.1.json
    ├── Mistral-7B-v0.1.json
    ├── Mistral-7B-v0.1_2gpu.json
    ├── OLMo-1B-hf.json
    ├── OLMo-1B-hf_2gpu.json
    ├── OLMo-7B-hf.json
    ├── OLMoE-1B-7B-0924-Instruct.json
    ├── OLMoE-1B-7B-0924.json
    ├── OpenAssistant_namespace.json
    ├── Phi-3-mini-128k-instruct.json
    ├── Phi-3-mini-4k-instruct.json
    ├── Qwen-VL-Chat.json
    ├── Qwen.json
    ├── Qwen1.5-MoE-A2.7B.json
    ├── Qwen2.5-1.5B.json
    ├── Qwen2.5-7B-Instruct-1M.json
    ├── Qwen2.5-7B-Instruct-GPTQ-Int8.json
    ├── Qwen2.5-7B.json
    ├── Qwen2.5-Coder-1.5B-Instruct.json
    ├── Qwen2.5-Coder-14B-Instruct-GPTQ-Int8.json
    ├── Qwen2.5-Coder-3B.json
    ├── Qwen2.5-Coder-7B-Instruct.json
    ├── Qwen2.5-Math-1.5B-Instruct.json
    ├── Qwen2.5-Math-1.5B.json
    ├── Qwen2.5-Math-7B-Instruct.json
    ├── Qwen2.5-Math-7B.json
    ├── Qwen7BInt8.json
    ├── Qwen_namespace.json
    ├── Salesforce_namespace.json
    ├── THUDM_namespace.json
    ├── TinyLlama-1.1B-Chat-v1.0.json
    ├── TinyLlama-1.1B-Chat-v1.0_13GB.json
    ├── TinyLlama-1.1B-Chat-v1.0_2gpu.json
    ├── TinyLlama-1.1B-Chat-v1.0_test.json
    ├── TinyLlama_namespace.json
    ├── XVERSE-13B-Chat.json
    ├── XVERSE-7B-Chat.json
    ├── allenai_namespace.json
    ├── baichuan-inc_namespace.json
    ├── bigcode_namespace.json
    ├── chatglm3-6b-128k.json
    ├── chatglm3-6b-32k.json
    ├── chatglm3-6b.json
    ├── codegen-2B-multi.json
    ├── core42_jais-13b-bnb-4bit.json
    ├── core42_jais-13b-chat-bnb-4bit.json
    ├── databricks_namespace.json
    ├── deepseek-ai_namespace.json
    ├── deepseek-llm-7b-chat.json
    ├── deepseek-llm-7b-chat_2gpu.json
    ├── deepseek-math-7b-instruct.json
    ├── deepseek-math-7b-rl.json
    ├── deepseek-vl2-tiny.json
    ├── dolly-v2-12b.json
    ├── facebook_namespace.json
    ├── falcon-7b.json
    ├── falcon-rw-7b.json
    ├── gemma-7b.json
    ├── gpt-j-6b.json
    ├── gpt2-xl.json
    ├── gpt4all-j.json
    ├── internlm2-7b.json
    ├── internlm2_5-7b-chat.json
    ├── llama_8BInt8.json
    ├── llava-1.5-7b-hf.json
    ├── llava-hf_namespace.json
    ├── mamba-1.4b-hf.json
    ├── mamba-2.8b-hf.json
    ├── meta-llama_namespace.json
    ├── microsoft_namespace.json
    ├── mistral.json
    ├── mistralai_namespace.json
    ├── models.txt
    ├── mosaicml_namespace.json
    ├── mpt-7b-storywriter.json
    ├── mpt-7b.json
    ├── namespace1.json
    ├── nomic-ai_namespace.json
    ├── ns1_namespace.json
    ├── oasst-sft-4-pythia-12b-epoch-3.5.json
    ├── openai-community_namespace.json
    ├── openbmb_namespace.json
    ├── opt-iml-max-1.3b.json
    ├── persimmon-8b-base.json
    ├── persimmon-8b-chat.json
    ├── public.json
    ├── pythia-12b.json
    ├── reader.json
    ├── stabilityai_namespace.json
    ├── stable-diffusion-xl-base-1.0.json
    ├── stablelm-3b-4e1t.json
    ├── stablelm-tuned-alpha-7b.json
    ├── starcoder2-3b.json
    ├── starcoder2-7b.json
    ├── state-spaces_namespace.json
    ├── tenant1.json
    └── tiiuae_namespace.json
├── dashboard
    ├── Makefile
    ├── __pycache__
    │   ├── na_pb2.cpython-38.pyc
    │   ├── na_pb2_grpc.cpython-38.pyc
    │   ├── qobjs_pb2.cpython-38.pyc
    │   └── qobjs_pb2_grpc.cpython-38.pyc
    ├── app.py
    ├── client.py
    ├── doc
    ├── gunicorn.conf.py
    ├── na_pb2.py
    ├── na_pb2_grpc.py
    ├── nginx.conf
    ├── qobjs_pb2.py
    ├── qobjs_pb2_grpc.py
    ├── requirements.txt
    ├── sql
    │   ├── audit.sql
    │   ├── create_table.sql
    │   ├── kv.sql
    │   └── secret.sql
    ├── static
    │   └── button.gif
    └── templates
    │   ├── admin.html
    │   ├── base.html
    │   ├── func.html
    │   ├── func_list.html
    │   ├── index.html
    │   ├── log.html
    │   ├── markdown.html
    │   ├── node.html
    │   ├── node_list.html
    │   ├── pod.html
    │   ├── pod_list.html
    │   └── snapshot_list.html
├── deployment
    ├── dashboard.Dockerfile
    ├── llava.Dockerfile
    ├── one.Dockerfile
    ├── spdk.Dockerfile
    ├── spdk.script
    ├── spdk2.Dockerfile
    └── vllm-opai.Dockerfile
├── doc
    ├── GPUSnapshot.png
    ├── architect.png
    ├── comparison.png
    ├── daemon.json
    ├── home.md
    ├── infer_Profile.png
    ├── keycloak.md
    ├── logo.png
    ├── logo1.png
    └── logo2.png
├── docker-compose.yml
├── docker-compose_blob.yml
├── inferx-realm.json
├── inferxlib
    ├── Cargo.toml
    └── src
    │   ├── common.rs
    │   ├── data_obj.rs
    │   ├── lib.rs
    │   ├── node.rs
    │   ├── obj_mgr
    │       ├── cidrlock.rs
    │       ├── func_mgr.rs
    │       ├── funcsnapshot_mgr.rs
    │       ├── mod.rs
    │       ├── namespace_mgr.rs
    │       ├── node_mgr.rs
    │       ├── pod_mgr.rs
    │       └── tenant_mgr.rs
    │   ├── resource.rs
    │   ├── selector.rs
    │   └── validation.rs
├── ixctl
    ├── command.rs
    ├── create.rs
    ├── delete.rs
    ├── get.rs
    ├── list.rs
    ├── main.rs
    ├── object_client.rs
    └── update.rs
├── ixctl_logging_config.yaml
├── k8s
    ├── clean-k3sagent.sh
    ├── cleanup-k3s.sh
    ├── dashboard.yaml
    ├── db-deployment.yaml
    ├── etcd.yaml
    ├── inferx_one.yaml
    ├── inferx_one_blob.yaml
    ├── ingress.yaml
    ├── install-k3s.sh
    ├── join-k3sagent.sh
    ├── keycloak.yaml
    ├── keycloak_postgres.yaml
    ├── nodeagent.yaml
    ├── nvidia-test.yaml
    ├── scheduler.yaml
    ├── secretdb.yaml
    ├── spdk.yaml
    └── statesvc.yaml
├── nodeconfig
    ├── node.json
    ├── node1.json
    ├── node2.json
    ├── node3.json
    ├── node4.json
    └── node_blob.json
└── script
    ├── inferx_clean.sh
    ├── run_llava.py
    ├── run_model.py
    └── run_stablediffusion.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | debug/
 4 | target/
 5 | 
 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 8 | Cargo.lock
 9 | 
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk
12 | 
13 | # MSVC Windows builds of rustc generate these, which store debugging information
14 | *.pdb
15 | 
16 | # RustRover
17 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
18 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
19 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
20 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
21 | #.idea/


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "qservice"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | inferxlib = { path = "./inferxlib" }
10 | 
11 | libc = "0.2.94"
12 | tokio = { version = "1.25", features = ["full"] }
13 | tokio-stream = { version = "0.1", features = ["net"] }
14 | tonic = { version = "0.8" }
15 | hostname = "^0.3"
16 | rand = "0.8.5"
17 | serde = { version = "1.0", features = ["derive"] }
18 | serde_json = "1.0"
19 | serde_derive = "1.0"
20 | regex = "1.7.1"
21 | reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
22 | chrono = "0.4.24"
23 | tower = "0.4.13"
24 | k8s-openapi = { version = "0.18.0", features = ["v1_26"] }
25 | simple-logging = "2.0.2"
26 | log = "0.4.17"
27 | log4rs = "1"
28 | const_format = "0.2.30"
29 | local-ip-address = "0.5.1"
30 | once_cell = "1.17.1"
31 | ipnetwork = "0.20.0"
32 | scopeguard = { version = "^1.1.0", default-features = false }
33 | errno = "0.2.4"
34 | nix = "0.23.1"
35 | futures = "0.3"
36 | dns-lookup = "2.0.4"
37 | #clap = "4.5.9"
38 | clap = "2.33.3"
39 | oauth2 = "4.0"
40 | 
41 | axum = "0.7.4"
42 | hyper = { version = "1.3.1", features = ["full"] }
43 | hyper-util = { version = "0.1.3", features = ["full"] }
44 | http-body-util = "0.1"
45 | backtrace = "0.3.74"
46 | 
47 | [dependencies.lazy_static]
48 | version = "1.0"
49 | features = ["spin_no_std"]
50 | 
51 | [dependencies.uuid]
52 | version = "1.3.1"
53 | features = [
54 |     "v4",                # Lets you generate random UUIDs
55 |     "fast-rng",          # Use a faster (but still sufficiently random) RNG
56 |     "macro-diagnostics", # Enable better diagnostics for compile-time UUIDs
57 | ]
58 | 
59 | [[bin]]
60 | name = "ixctl"
61 | path = "ixctl/main.rs"
62 | 


--------------------------------------------------------------------------------
/config/Aquila-7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "BAAI",
 5 |     "name": "Aquila-7B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "BAAI/Aquila-7B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 60000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13000
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Here is a recipe for vegan banana bread:",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "BAAI/Aquila-7B",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/BAAI_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "BAAI",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/Baichuan-7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "baichuan-inc",
 5 |     "name": "Baichuan-7B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "baichuan-inc/Baichuan-7B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "1200",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 60000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Give me a short introduction to large language model.",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "baichuan-inc/Baichuan-7B",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/Baichuan2-13B-Chat-4bits.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "baichuan-inc",
 5 |     "name": "Baichuan2-13B-Chat-4bits",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "baichuan-inc/Baichuan2-13B-Chat-4bits",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000",
15 |                 "--trust-remote-code"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 24000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "解释一下'温故而知新'",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "baichuan-inc/Baichuan2-13B-Chat-4bits",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/DeciLM-7B-instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Deci",
 5 |     "name": "DeciLM-7B-instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Deci/DeciLM-7B-instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13000
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ]
32 |             ],
33 |             "mounts": [
34 |                 {
35 |                     "hostpath": "/home/brad/cache",
36 |                     "mountpath": "/root/.cache/huggingface"
37 |                 }
38 |             ],
39 |             "endpoint": {
40 |                 "port": 8000,
41 |                 "schema": "Http",
42 |                 "probe": "/health"
43 |             },
44 |             "sample_query": {
45 |                 "apiType": "openai",
46 |                 "prompt": "Here is a recipe for vegan banana bread:",
47 |                 "path": "v1/completions",
48 |                 "body": {
49 |                     "model": "Deci/DeciLM-7B-instruct",
50 |                     "max_tokens": "1000",
51 |                     "temperature": "0",
52 |                     "stream": "true"
53 |                 }
54 |             },
55 |             "standby": {
56 |                 "gpu": "Blob",
57 |                 "pageable": "Blob",
58 |                 "pinned": "Blob"
59 |             }
60 |         }
61 |     }
62 | }


--------------------------------------------------------------------------------
/config/DeciLM-7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Deci",
 5 |     "name": "DeciLM-7B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Deci/DeciLM-7B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "1200",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13000
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ]
32 |             ],
33 |             "mounts": [
34 |                 {
35 |                     "hostpath": "/home/brad/cache",
36 |                     "mountpath": "/root/.cache/huggingface"
37 |                 }
38 |             ],
39 |             "endpoint": {
40 |                 "port": 8000,
41 |                 "schema": "Http",
42 |                 "probe": "/health"
43 |             },
44 |             "sample_query": {
45 |                 "apiType": "openai",
46 |                 "prompt": "Here is a recipe for vegan banana bread:",
47 |                 "path": "v1/completions",
48 |                 "body": {
49 |                     "model": "Deci/DeciLM-7B",
50 |                     "max_tokens": "1000",
51 |                     "temperature": "0",
52 |                     "stream": "true"
53 |                 }
54 |             },
55 |             "standby": {
56 |                 "gpu": "Blob",
57 |                 "pageable": "Blob",
58 |                 "pinned": "Blob"
59 |             }
60 |         }
61 |     }
62 | }


--------------------------------------------------------------------------------
/config/Deci_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "Deci",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct copy.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "gemma-7b",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "google/gemma-7b",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 15000
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "google/gemma-7b",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/EXAONE-3.0-7.8B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "EXAONE-3.0-7.8B-Instruct",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 15000
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/EleutherAI_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "EleutherAI",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "meta-llama",
 5 |     "name": "Llama-3.2-3B-Instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "meta-llama/Llama-3.2-3B-Instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "200"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 20000,
19 |                 "Mem": 50000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 14600
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "def print_hello_world():",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "meta-llama/Llama-3.2-3B-Instruct",
53 |                     "max_tokens": "120",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/Llama-3.2-3B-Instruct_2gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "meta-llama",
 5 |     "name": "Llama-3.2-3B-Instruct_2gpu",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "meta-llama/Llama-3.2-3B-Instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "1000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 14600
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ]
32 |             ],
33 |             "mounts": [
34 |                 {
35 |                     "hostpath": "/home/brad/cache",
36 |                     "mountpath": "/root/.cache/huggingface"
37 |                 }
38 |             ],
39 |             "endpoint": {
40 |                 "port": 8000,
41 |                 "schema": "Http",
42 |                 "probe": "/health"
43 |             },
44 |             "sample_query": {
45 |                 "apiType": "openai",
46 |                 "prompt": "def print_hello_world():",
47 |                 "path": "v1/completions",
48 |                 "body": {
49 |                     "model": "meta-llama/Llama-3.2-3B-Instruct",
50 |                     "max_tokens": "120",
51 |                     "temperature": "0",
52 |                     "stream": "true"
53 |                 }
54 |             },
55 |             "standby": {
56 |                 "gpu": "Blob",
57 |                 "pageable": "Blob",
58 |                 "pinned": "Blob"
59 |             }
60 |         }
61 |     }
62 | }


--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Meta-Llama-3-8B-Instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "meta-llama/Meta-Llama-3-8B-Instruct",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "meta-llama/Meta-Llama-3-8B-Instruct",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/Meta-Llama-3-8B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Meta-Llama-3-8B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "meta-llama/Meta-Llama-3-8B",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "meta-llama/Meta-Llama-3-8B",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/MiniCPM-2B-dpo-bf16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "openbmb",
 5 |     "name": "MiniCPM-2B-dpo-bf16",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "openbmb/MiniCPM-2B-dpo-bf16",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 28000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "Give me a short introduction to large language model.",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "openbmb/MiniCPM-2B-dpo-bf16",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/MiniCPM-2B-sft-bf16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "openbmb",
 5 |     "name": "MiniCPM-2B-sft-bf16",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "openbmb/MiniCPM-2B-sft-bf16",
12 |                 "--trust-remote-code",
13 |                 "--max-model-len",
14 |                 "1200"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 9000
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Give me a short introduction to large language model.",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "openbmb/MiniCPM-2B-sft-bf16",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/MiniCPM3-4B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "openbmb",
 5 |     "name": "MiniCPM3-4B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v.0.1",
 9 |             "commands": [
10 |                 "--model",
11 |                 "openbmb/MiniCPM3-4B",
12 |                 "--enforce-eager",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "200"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 24000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 9000
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "推荐5个北京的景点。",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "openbmb/MiniCPM3-4B",
49 |                     "max_tokens": "100",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/Minitron-8B-Base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Minitron-8B-Base",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "nvidia/Minitron-8B-Base",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 13800
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/Quark/target/debug/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "nvidia/Minitron-8B-Base",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/Mistral-7B-Instruct-v0.1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Mistral-7B-Instruct-v0.1",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "mistralai/Mistral-7B-Instruct-v0.1",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "mistralai/Mistral-7B-Instruct-v0.1",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/Mistral-7B-v0.1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "mistralai",
 5 |     "name": "Mistral-7B-v0.1",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "mistralai/Mistral-7B-v0.1",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--gpu-memory-utilization",
15 |                 "0.99",
16 |                 "--max-model-len",
17 |                 "200"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 20000,
21 |                 "Mem": 30000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 1,
25 |                     "vRam": 14800
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "I like traveling by train because",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "mistralai/Mistral-7B-v0.1",
51 |                     "max_tokens": "180",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Blob",
58 |                 "pageable": "Blob",
59 |                 "pinned": "Blob"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/OLMo-1B-hf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "allenai",
 5 |     "name": "OLMo-1B-hf",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "allenai/OLMo-1B-hf",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 50000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 14600
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "What is the capital of USA?",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "allenai/OLMo-1B-hf",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/OLMo-1B-hf_2gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "allenai",
 5 |     "name": "OLMo-1B-hf_2gpu",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "allenai/OLMo-1B-hf",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000",
15 |                 "--tensor-parallel-size=2"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 50000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 2,
23 |                     "vRam": 14600
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "What is the capital of USA?",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "allenai/OLMo-1B-hf",
49 |                     "max_tokens": "1000",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/OLMo-7B-hf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "allenai",
 5 |     "name": "OLMo-7B-hf",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "allenai/OLMo-7B-hf",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000",
15 |                 "--tensor-parallel-size=2"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 20000,
19 |                 "Mem": 70000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 2,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "What is the capital of USA?",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "allenai/OLMo-7B-hf",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "OLMoE-1B-7B-0924-Instruct",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "allenai/OLMoE-1B-7B-0924-Instruct",
11 |             "--disable-custom-all-reduce",
12 |             "--trust-remote-code",
13 |             "--max-model-len",
14 |             "2000",
15 |             "--tensor-parallel-size=2"
16 |         ],
17 |         "resources": {
18 |             "CPU": 6000,
19 |             "Mem": 50000,
20 |             "GPU": {
21 |                 "Type": "Any",
22 |                 "Count": 2,
23 |                 "vRam": 13800
24 |             }
25 |         },
26 |         "envs": [
27 |             [
28 |                 "LD_LIBRARY_PATH",
29 |                 "/Quark/target/debug/:$LD_LIBRARY_PATH"
30 |             ]
31 |         ],
32 |         "mounts": [
33 |             {
34 |                 "hostpath": "/home/brad/cache",
35 |                 "mountpath": "/root/.cache/huggingface"
36 |             }
37 |         ],
38 |         "endpoint": {
39 |             "path": "/v1/completions",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "probe": {
44 |             "path": "/health",
45 |             "port": 8000,
46 |             "schema": "Http"
47 |         },
48 |         "api_type": {
49 |             "openai": {
50 |                 "name": "allenai/OLMoE-1B-7B-0924-Instruct",
51 |                 "max_tokens": 1000,
52 |                 "temperature": 0
53 |             }
54 |         },
55 |         "keepalive": "Blob"
56 |     }
57 | }


--------------------------------------------------------------------------------
/config/OLMoE-1B-7B-0924.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "OLMoE-1B-7B-0924",
 6 |     "spec": {
 7 |         "image": "vllm-openai-upgraded:v.0.1",
 8 |         "commands": [
 9 |             "--model",
10 |             "allenai/OLMoE-1B-7B-0924",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 13800
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/Quark/target/debug/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "allenai/OLMoE-1B-7B-0924",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/OpenAssistant_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "OpenAssistant",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/Phi-3-mini-128k-instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "microsoft",
 5 |     "name": "Phi-3-mini-128k-instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "microsoft/Phi-3-mini-128k-instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 24000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13000
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "How to explain Internet for a medieval knight?",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "microsoft/Phi-3-mini-128k-instruct",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/Phi-3-mini-4k-instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "microsoft",
 5 |     "name": "Phi-3-mini-4k-instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "microsoft/Phi-3-mini-4k-instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 24000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13000
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "microsoft/Phi-3-mini-4k-instruct",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/Qwen-VL-Chat.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Qwen-VL-Chat",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen-VL-Chat",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 70000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "Qwen/Qwen-VL-Chat",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/Qwen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Qwen",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "Qwen/Qwen2.5-3B-Instruct",
11 |             "--enforce-eager"
12 |         ],
13 |         "resources": {
14 |             "CPU": 100,
15 |             "Mem": 200,
16 |             "GPU": {
17 |                 "Type": "Any",
18 |                 "Usage": {
19 |                     "Partial": 100
20 |                 }
21 |             }
22 |         },
23 |         "envs": [
24 |             [
25 |                 "LD_LIBRARY_PATH",
26 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
27 |             ]
28 |         ],
29 |         "mounts": [
30 |             {
31 |                 "hostpath": "/home/brad/cache",
32 |                 "mountpath": "/root/.cache/huggingface"
33 |             }
34 |         ],
35 |         "endpoint": {
36 |             "path": "/v1/completions",
37 |             "port": 8000,
38 |             "schema": "Http"
39 |         },
40 |         "probe": {
41 |             "path": "/health",
42 |             "port": 8000,
43 |             "schema": "Http"
44 |         },
45 |         "api_type": {
46 |             "openai": {
47 |                 "name": "Qwen/Qwen2.5-3B-Instruct",
48 |                 "max_tokens": 200,
49 |                 "temperature": 0
50 |             }
51 |         }
52 |     }
53 | }


--------------------------------------------------------------------------------
/config/Qwen1.5-MoE-A2.7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Qwen1.5-MoE-A2.7B",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "Qwen/Qwen1.5-MoE-A2.7B",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 15000
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "Qwen/Qwen1.5-MoE-A2.7B",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-1.5B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-1.5B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-1.5B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 8000
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Can you provide ways to eat combinations of bananas and dragonfruits?",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "Qwen/Qwen2.5-1.5B",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-7B-Instruct-GPTQ-Int8.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-7B-Instruct-GPTQ-Int8",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
12 |                 "--gpu-memory-utilization",
13 |                 "0.99",
14 |                 "--max-model-len",
15 |                 "500"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 20000,
19 |                 "Mem": 30000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 14200
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "Give me a short introduction to large language model.",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
53 |                     "max_tokens": "300",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "models--Qwen--Qwen2.5-7B",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "Qwen/Qwen2.5-7B",
11 |             "--enforce-eager",
12 |             "--max-model-len",
13 |             "2000",
14 |             "--tensor-parallel-size=2"
15 |         ],
16 |         "resources": {
17 |             "CPU": 6000,
18 |             "Mem": 80000,
19 |             "GPU": {
20 |                 "Type": "Any",
21 |                 "Count": 2,
22 |                 "vRam": 14000
23 |             }
24 |         },
25 |         "envs": [
26 |             [
27 |                 "LD_LIBRARY_PATH",
28 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |             ]
30 |         ],
31 |         "mounts": [
32 |             {
33 |                 "hostpath": "/home/brad/cache",
34 |                 "mountpath": "/root/.cache/huggingface"
35 |             }
36 |         ],
37 |         "endpoint": {
38 |             "path": "/v1/completions",
39 |             "port": 8000,
40 |             "schema": "Http"
41 |         },
42 |         "probe": {
43 |             "path": "/health",
44 |             "port": 8000,
45 |             "schema": "Http"
46 |         },
47 |         "api_type": {
48 |             "openai": {
49 |                 "name": "Qwen/Qwen2.5-7B",
50 |                 "max_tokens": 1000,
51 |                 "temperature": 0
52 |             }
53 |         }
54 |     }
55 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-1.5B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Coder-1.5B-Instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Coder-1.5B-Instruct",
12 |                 "--max-model-len",
13 |                 "1000"
14 |             ],
15 |             "resources": {
16 |                 "CPU": 12000,
17 |                 "Mem": 24000,
18 |                 "GPU": {
19 |                     "Type": "Any",
20 |                     "Count": 1,
21 |                     "vRam": 6000
22 |                 }
23 |             },
24 |             "envs": [
25 |                 [
26 |                     "LD_LIBRARY_PATH",
27 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 |                 ],
29 |                 [
30 |                     "VLLM_CUDART_SO_PATH",
31 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "write a quick sort algorithm.",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
51 |                     "max_tokens": "800",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Blob",
58 |                 "pageable": "Blob",
59 |                 "pinned": "Blob"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-3B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Coder-3B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Coder-3B",
12 |                 "--max-model-len",
13 |                 "1000"
14 |             ],
15 |             "resources": {
16 |                 "CPU": 12000,
17 |                 "Mem": 24000,
18 |                 "GPU": {
19 |                     "Type": "Any",
20 |                     "Count": 1,
21 |                     "vRam": 10000
22 |                 }
23 |             },
24 |             "envs": [
25 |                 [
26 |                     "LD_LIBRARY_PATH",
27 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 |                 ],
29 |                 [
30 |                     "VLLM_CUDART_SO_PATH",
31 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "write a quick sort algorithm.",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "Qwen/Qwen2.5-Coder-3B",
51 |                     "max_tokens": "800",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Blob",
58 |                 "pageable": "Blob",
59 |                 "pinned": "Blob"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Coder-7B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Coder-7B-Instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Coder-7B-Instruct",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "write a quick sort algorithm.",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B-Instruct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Math-1.5B-Instruct",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Math-1.5B-Instruct"
12 |             ],
13 |             "resources": {
14 |                 "CPU": 12000,
15 |                 "Mem": 24000,
16 |                 "GPU": {
17 |                     "Type": "Any",
18 |                     "Count": 1,
19 |                     "vRam": 7000
20 |                 }
21 |             },
22 |             "envs": [
23 |                 [
24 |                     "LD_LIBRARY_PATH",
25 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
26 |                 ],
27 |                 [
28 |                     "VLLM_CUDART_SO_PATH",
29 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "Qwen/Qwen2.5-Math-1.5B-Instruct",
49 |                     "max_tokens": "200",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Math-1.5B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Math-1.5B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Math-1.5B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 8000
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "Qwen/Qwen2.5-Math-1.5B",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/Qwen2.5-Math-7B.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Qwen",
 5 |     "name": "Qwen2.5-Math-7B",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "Qwen/Qwen2.5-Math-7B",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$.",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "Qwen/Qwen2.5-Math-7B",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/Qwen7BInt8.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "Qwen",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
11 |             "--enforce-eager",
12 |             "--gpu-memory-utilization 0.99",
13 |             "--max-model-len 1000"
14 |         ],
15 |         "resources": {
16 |             "CPU": 100,
17 |             "Mem": 200,
18 |             "GPU": {
19 |                 "Type": "RTX3060",
20 |                 "Count": 1
21 |             }
22 |         },
23 |         "envs": [
24 |             [
25 |                 "LD_LIBRARY_PATH",
26 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
27 |             ]
28 |         ],
29 |         "mounts": [
30 |             {
31 |                 "hostpath": "/home/brad/cache",
32 |                 "mountpath": "/root/.cache/huggingface"
33 |             }
34 |         ],
35 |         "endpoint": {
36 |             "path": "/v1/completions",
37 |             "port": 8000,
38 |             "schema": "Http"
39 |         },
40 |         "probe": {
41 |             "path": "/health",
42 |             "port": 8000,
43 |             "schema": "Http"
44 |         },
45 |         "api_type": {
46 |             "openai": {
47 |                 "name": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
48 |                 "max_tokens": 200,
49 |                 "temperature": 0
50 |             }
51 |         }
52 |     }
53 | }


--------------------------------------------------------------------------------
/config/Qwen_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "Qwen",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/Salesforce_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "Salesforce",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/THUDM_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "THUDM",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "TinyLlama",
 5 |     "name": "TinyLlama-1.1B-Chat-v1.0",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 20000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 4800
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Seattle is a",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "File",
59 |                 "pageable": "File",
60 |                 "pinned": "File"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0_13GB.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "TinyLlama",
 5 |     "name": "TinyLlama-1.1B-Chat-v1.0_13GB",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 20000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 13800
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Seattle is a",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0_2gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "TinyLlama",
 5 |     "name": "TinyLlama-1.1B-Chat-v1.0_2gpu",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000",
15 |                 "--tensor-parallel-size=2"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 20000,
19 |                 "Mem": 50000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 2,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "Seattle is a",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
49 |                     "max_tokens": "1000",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/TinyLlama-1.1B-Chat-v1.0_test.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "TinyLlama-1.1B-Chat-v1.0_test",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.4.2",
 9 |             "commands": [
10 |                 "--model",
11 |                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 20000,
18 |                 "Mem": 18000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 4500
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ]
30 |             ],
31 |             "mounts": [
32 |                 {
33 |                     "hostpath": "/home/brad/cache",
34 |                     "mountpath": "/root/.cache/huggingface"
35 |                 }
36 |             ],
37 |             "endpoint": {
38 |                 "port": 8000,
39 |                 "schema": "Http",
40 |                 "probe": "/health"
41 |             },
42 |             "sample_query": {
43 |                 "apiType": "openai",
44 |                 "prompt": "Seattle is a",
45 |                 "path": "v1/completions",
46 |                 "body": {
47 |                     "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
48 |                     "max_tokens": "1000",
49 |                     "temperature": "0",
50 |                     "stream": "true"
51 |                 }
52 |             },
53 |             "standby": {
54 |                 "gpu": "File",
55 |                 "pageable": "File",
56 |                 "pinned": "File"
57 |             }
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/TinyLlama_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "TinyLlama",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/XVERSE-13B-Chat.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "XVERSE-13B-Chat",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "xverse/XVERSE-13B-Chat",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "xverse/XVERSE-13B-Chat",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/XVERSE-7B-Chat.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "XVERSE-7B-Chat",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "xverse/XVERSE-7B-Chat",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "xverse/XVERSE-7B-Chat",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/allenai_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "allenai",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/baichuan-inc_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "baichuan-inc",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/bigcode_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "bigcode",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/chatglm3-6b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "THUDM",
 5 |     "name": "chatglm3-6b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "THUDM/chatglm3-6b",
12 |                 "--enforce-eager",
13 |                 "--max-model-len",
14 |                 "1500",
15 |                 "--gpu-memory-utilization",
16 |                 "0.99",
17 |                 "--trust-remote-code"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 12000,
21 |                 "Mem": 24000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 1,
25 |                     "vRam": 13800
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ],
33 |                 [
34 |                     "VLLM_CUDART_SO_PATH",
35 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
36 |                 ]
37 |             ],
38 |             "mounts": [
39 |                 {
40 |                     "hostpath": "/home/brad/cache",
41 |                     "mountpath": "/root/.cache/huggingface"
42 |                 }
43 |             ],
44 |             "endpoint": {
45 |                 "port": 8000,
46 |                 "schema": "Http",
47 |                 "probe": "/health"
48 |             },
49 |             "sample_query": {
50 |                 "apiType": "openai",
51 |                 "prompt": "Give me a short introduction to large language model.",
52 |                 "path": "v1/completions",
53 |                 "body": {
54 |                     "model": "THUDM/chatglm3-6b",
55 |                     "max_tokens": "200",
56 |                     "temperature": "0",
57 |                     "stream": "true"
58 |                 }
59 |             },
60 |             "standby": {
61 |                 "gpu": "Blob",
62 |                 "pageable": "Blob",
63 |                 "pinned": "Blob"
64 |             }
65 |         }
66 |     }
67 | }


--------------------------------------------------------------------------------
/config/codegen-2B-multi.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "Salesforce",
 5 |     "name": "codegen-2B-multi",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v0.1.0",
 9 |             "entrypoint": [
10 |                 "/usr/bin/python3"
11 |             ],
12 |             "commands": [
13 |                 "/usr/lib/run_model.py",
14 |                 "Salesforce/codegen-2B-multi",
15 |                 "200"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 20000,
19 |                 "Mem": 12000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13000
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "standard",
45 |                 "prompt": "def hello_world():",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "N/A",
49 |                     "max_tokens": "200",
50 |                     "temperature": "0"
51 |                 }
52 |             },
53 |             "standby": {
54 |                 "gpu": "Blob",
55 |                 "pageable": "Blob",
56 |                 "pinned": "Blob"
57 |             }
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/core42_jais-13b-bnb-4bit.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "core42_jais-13b-bnb-4bit",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "jwnder/core42_jais-13b-bnb-4bit",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 6000,
20 |                 "Mem": 20000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 1,
24 |                     "vRam": 15000
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ]
32 |             ],
33 |             "mounts": [
34 |                 {
35 |                     "hostpath": "/home/brad/cache",
36 |                     "mountpath": "/root/.cache/huggingface"
37 |                 }
38 |             ],
39 |             "endpoint": {
40 |                 "path": "/v1/completions",
41 |                 "port": 8000,
42 |                 "schema": "Http"
43 |             },
44 |             "probe": {
45 |                 "path": "/health",
46 |                 "port": 8000,
47 |                 "schema": "Http"
48 |             },
49 |             "api_type": {
50 |                 "openai": {
51 |                     "name": "jwnder/core42_jais-13b-bnb-4bit",
52 |                     "max_tokens": 1000,
53 |                     "temperature": 0
54 |                 }
55 |             },
56 |             "keepalive": "Blob"
57 |         }
58 |     }
59 | }


--------------------------------------------------------------------------------
/config/core42_jais-13b-chat-bnb-4bit.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "core42_jais-13b-chat-bnb-4bit",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "jwnder/core42_jais-13b-chat-bnb-4bit",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 6000,
20 |                 "Mem": 20000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 1,
24 |                     "vRam": 15000
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ]
32 |             ],
33 |             "mounts": [
34 |                 {
35 |                     "hostpath": "/home/brad/cache",
36 |                     "mountpath": "/root/.cache/huggingface"
37 |                 }
38 |             ],
39 |             "endpoint": {
40 |                 "path": "/v1/completions",
41 |                 "port": 8000,
42 |                 "schema": "Http"
43 |             },
44 |             "probe": {
45 |                 "path": "/health",
46 |                 "port": 8000,
47 |                 "schema": "Http"
48 |             },
49 |             "api_type": {
50 |                 "openai": {
51 |                     "name": "jwnder/core42_jais-13b-chat-bnb-4bit",
52 |                     "max_tokens": 1000,
53 |                     "temperature": 0
54 |                 }
55 |             },
56 |             "keepalive": "Blob"
57 |         }
58 |     }
59 | }


--------------------------------------------------------------------------------
/config/databricks_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "databricks",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/deepseek-ai_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "deepseek-ai",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/deepseek-math-7b-rl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "deepseek-ai",
 5 |     "name": "deepseek-math-7b-rl",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v.0.1",
 9 |             "commands": [
10 |                 "--model",
11 |                 "deepseek-ai/deepseek-math-7b-rl",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 20000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 13000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "deepseek-ai/deepseek-math-7b-rl",
51 |                     "max_tokens": "1000",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Blob",
58 |                 "pageable": "Blob",
59 |                 "pinned": "Blob"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/deepseek-vl2-tiny.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "deepseek-ai",
 5 |     "name": "deepseek-vl2-tiny",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v.0.1",
 9 |             "commands": [
10 |                 "--model",
11 |                 "deepseek-ai/deepseek-vl2-tiny",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 20000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 13000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "What is the capital of USA?",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "deepseek-ai/deepseek-vl2-tiny",
51 |                     "max_tokens": "1000",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Blob",
58 |                 "pageable": "Blob",
59 |                 "pinned": "Blob"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/facebook_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "facebook",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/falcon-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "tiiuae",
 5 |     "name": "falcon-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "tiiuae/falcon-7b",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "tiiuae/falcon-7b",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/falcon-rw-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "tiiuae",
 5 |     "name": "falcon-rw-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "tiiuae/falcon-rw-7b",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000",
15 |                 "--tensor-parallel-size=2"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 80000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 2,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ],
31 |                 [
32 |                     "VLLM_CUDART_SO_PATH",
33 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
34 |                 ]
35 |             ],
36 |             "mounts": [
37 |                 {
38 |                     "hostpath": "/home/brad/cache",
39 |                     "mountpath": "/root/.cache/huggingface"
40 |                 }
41 |             ],
42 |             "endpoint": {
43 |                 "port": 8000,
44 |                 "schema": "Http",
45 |                 "probe": "/health"
46 |             },
47 |             "sample_query": {
48 |                 "apiType": "openai",
49 |                 "prompt": "Here is a recipe for vegan banana bread:",
50 |                 "path": "v1/completions",
51 |                 "body": {
52 |                     "model": "tiiuae/falcon-rw-7b",
53 |                     "max_tokens": "1000",
54 |                     "temperature": "0",
55 |                     "stream": "true"
56 |                 }
57 |             },
58 |             "standby": {
59 |                 "gpu": "Blob",
60 |                 "pageable": "Blob",
61 |                 "pinned": "Blob"
62 |             }
63 |         }
64 |     }
65 | }


--------------------------------------------------------------------------------
/config/gemma-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "gemma-7b",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "google/gemma-7b",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000",
16 |             "--tensor-parallel-size=2"
17 |         ],
18 |         "resources": {
19 |             "CPU": 6000,
20 |             "Mem": 50000,
21 |             "GPU": {
22 |                 "Type": "Any",
23 |                 "Count": 2,
24 |                 "vRam": 15000
25 |             }
26 |         },
27 |         "envs": [
28 |             [
29 |                 "LD_LIBRARY_PATH",
30 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |             ]
32 |         ],
33 |         "mounts": [
34 |             {
35 |                 "hostpath": "/home/brad/cache",
36 |                 "mountpath": "/root/.cache/huggingface"
37 |             }
38 |         ],
39 |         "endpoint": {
40 |             "path": "/v1/completions",
41 |             "port": 8000,
42 |             "schema": "Http"
43 |         },
44 |         "probe": {
45 |             "path": "/health",
46 |             "port": 8000,
47 |             "schema": "Http"
48 |         },
49 |         "api_type": {
50 |             "openai": {
51 |                 "name": "google/gemma-7b",
52 |                 "max_tokens": 1000,
53 |                 "temperature": 0
54 |             }
55 |         },
56 |         "keepalive": "Blob"
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/gpt-j-6b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "gpt-j-6b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "EleutherAI/gpt-j-6b",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 70000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "EleutherAI/gpt-j-6b",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/gpt2-xl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "openai-community",
 5 |     "name": "gpt2-xl",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "openai-community/gpt2-xl",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "800"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 24000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 12000
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "Here is a recipe for vegan banana bread:",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "openai-community/gpt2-xl",
52 |                     "max_tokens": "600",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/gpt4all-j.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "nomic-ai",
 5 |     "name": "gpt4all-j",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "nomic-ai/gpt4all-j",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 60000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Here is a recipe for vegan banana bread:",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "nomic-ai/gpt4all-j",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/internlm2-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "internlm2-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "internlm/internlm2-7b",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "internlm/internlm2-7b",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/internlm2_5-7b-chat.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "internlm2_5-7b-chat",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "internlm/internlm2_5-7b-chat",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "internlm/internlm2_5-7b-chat",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/llama_8BInt8.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "TinyLlama-1.1B-Chat-v1.0_2gpu",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "meta-llama/Llama-Guard-3-8B-INT8",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--max-model-len",
14 |             "2000",
15 |             "--tensor-parallel-size=2"
16 |         ],
17 |         "resources": {
18 |             "CPU": 6000,
19 |             "Mem": 50000,
20 |             "GPU": {
21 |                 "Type": "Any",
22 |                 "Count": 2,
23 |                 "vRam": 13800
24 |             }
25 |         },
26 |         "envs": [
27 |             [
28 |                 "LD_LIBRARY_PATH",
29 |                 "/Quark/target/debug/:$LD_LIBRARY_PATH"
30 |             ]
31 |         ],
32 |         "mounts": [
33 |             {
34 |                 "hostpath": "/home/brad/cache",
35 |                 "mountpath": "/root/.cache/huggingface"
36 |             }
37 |         ],
38 |         "endpoint": {
39 |             "path": "/v1/completions",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "probe": {
44 |             "path": "/health",
45 |             "port": 8000,
46 |             "schema": "Http"
47 |         },
48 |         "api_type": {
49 |             "openai": {
50 |                 "name": "meta-llama/Llama-Guard-3-8B-INT8",
51 |                 "max_tokens": 1000,
52 |                 "temperature": 0
53 |             }
54 |         },
55 |         "keepalive": "Blob"
56 |     }
57 | }


--------------------------------------------------------------------------------
/config/llava-1.5-7b-hf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "llava-hf",
 5 |     "name": "llava-1.5-7b-hf",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v0.1.0",
 9 |             "entrypoint": [
10 |                 "/usr/bin/python3"
11 |             ],
12 |             "commands": [
13 |                 "/usr/lib/run_llava.py"
14 |             ],
15 |             "resources": {
16 |                 "CPU": 20000,
17 |                 "Mem": 12000,
18 |                 "GPU": {
19 |                     "Type": "Any",
20 |                     "Count": 1,
21 |                     "vRam": 14000
22 |                 }
23 |             },
24 |             "envs": [
25 |                 [
26 |                     "LD_LIBRARY_PATH",
27 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 |                 ]
29 |             ],
30 |             "mounts": [
31 |                 {
32 |                     "hostpath": "/home/brad/cache",
33 |                     "mountpath": "/root/.cache/huggingface"
34 |                 }
35 |             ],
36 |             "endpoint": {
37 |                 "port": 8000,
38 |                 "schema": "Http",
39 |                 "probe": "/health"
40 |             },
41 |             "sample_query": {
42 |                 "apiType": "llava",
43 |                 "prompt": "What is shown in this image?",
44 |                 "path": "v1/completions",
45 |                 "body": {
46 |                     "image": "https://www.ilankelman.org/stopsigns/australia.jpg"
47 |                 }
48 |             },
49 |             "standby": {
50 |                 "gpu": "Blob",
51 |                 "pageable": "Blob",
52 |                 "pinned": "Blob"
53 |             }
54 |         }
55 |     }
56 | }


--------------------------------------------------------------------------------
/config/llava-hf_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "llava-hf",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/mamba-1.4b-hf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "state-spaces",
 5 |     "name": "mamba-1.4b-hf",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "state-spaces/mamba-1.4b-hf",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 50000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "state-spaces/mamba-1.4b-hf",
49 |                     "max_tokens": "1000",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/mamba-2.8b-hf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "state-spaces",
 5 |     "name": "mamba-2.8b-hf",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v.0.1",
 9 |             "commands": [
10 |                 "--model",
11 |                 "state-spaces/mamba-2.8b-hf",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 12000,
19 |                 "Mem": 50000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 13800
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "port": 8000,
40 |                 "schema": "Http",
41 |                 "probe": "/health"
42 |             },
43 |             "sample_query": {
44 |                 "apiType": "openai",
45 |                 "prompt": "Hey how are you doing?\n\nI'm doing great.\n\nI",
46 |                 "path": "v1/completions",
47 |                 "body": {
48 |                     "model": "state-spaces/mamba-2.8b-hf",
49 |                     "max_tokens": "1000",
50 |                     "temperature": "0",
51 |                     "stream": "true"
52 |                 }
53 |             },
54 |             "standby": {
55 |                 "gpu": "Blob",
56 |                 "pageable": "Blob",
57 |                 "pinned": "Blob"
58 |             }
59 |         }
60 |     }
61 | }


--------------------------------------------------------------------------------
/config/meta-llama_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "meta-llama",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/microsoft_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "microsoft",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/mistral.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "mistral",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "mistralai/Mistral-7B-v0.1",
11 |             "--enforce-eager"
12 |         ],
13 |         "resources": {
14 |             "CPU": 100,
15 |             "Mem": 200,
16 |             "GPU": {
17 |                 "Type": "RTX3060",
18 |                 "Count": 2
19 |             }
20 |         },
21 |         "envs": [
22 |             [
23 |                 "LD_LIBRARY_PATH",
24 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
25 |             ]
26 |         ],
27 |         "mounts": [
28 |             {
29 |                 "hostpath": "/home/brad/cache",
30 |                 "mountpath": "/root/.cache/huggingface"
31 |             }
32 |         ],
33 |         "endpoint": {
34 |             "path": "/v1/completions",
35 |             "port": 8000,
36 |             "schema": "Http"
37 |         },
38 |         "probe": {
39 |             "path": "/health",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "api_type": {
44 |             "openai": {
45 |                 "name": "mistralai/Mistral-7B-v0.1",
46 |                 "max_tokens": 200,
47 |                 "temperature": 0
48 |             }
49 |         }
50 |     }
51 | }


--------------------------------------------------------------------------------
/config/mistralai_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "mistralai",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/mosaicml_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "mosaicml",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/mpt-7b-storywriter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "mosaicml",
 5 |     "name": "mpt-7b-storywriter",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "mosaicml/mpt-7b-storywriter",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "1000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Here is a recipe for vegan banana bread:",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "mosaicml/mpt-7b-storywriter",
54 |                     "max_tokens": "800",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/mpt-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "mosaicml",
 5 |     "name": "mpt-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "mosaicml/mpt-7b",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "1000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "Here is a recipe for vegan banana bread:",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "mosaicml/mpt-7b",
54 |                     "max_tokens": "800",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/namespace1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "t1",
 4 |     "namespace": "system",
 5 |     "name": "ns1",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/nomic-ai_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "nomic-ai",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/ns1_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "t1",
 4 |     "namespace": "system",
 5 |     "name": "ns1",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/openai-community_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "openai-community",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/openbmb_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "openbmb",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/opt-iml-max-1.3b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "facebook",
 5 |     "name": "opt-iml-max-1.3b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "facebook/opt-iml-max-1.3b",
12 |                 "--max-model-len",
13 |                 "200"
14 |             ],
15 |             "resources": {
16 |                 "CPU": 12000,
17 |                 "Mem": 24000,
18 |                 "GPU": {
19 |                     "Type": "Any",
20 |                     "Count": 1,
21 |                     "vRam": 4500
22 |                 }
23 |             },
24 |             "envs": [
25 |                 [
26 |                     "LD_LIBRARY_PATH",
27 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
28 |                 ],
29 |                 [
30 |                     "VLLM_CUDART_SO_PATH",
31 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "openai",
47 |                 "prompt": "What is the capital of USA?",
48 |                 "path": "v1/completions",
49 |                 "body": {
50 |                     "model": "facebook/opt-iml-max-1.3b",
51 |                     "max_tokens": "100",
52 |                     "temperature": "0",
53 |                     "stream": "true"
54 |                 }
55 |             },
56 |             "standby": {
57 |                 "gpu": "Mem",
58 |                 "pageable": "File",
59 |                 "pinned": "Mem"
60 |             }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/config/persimmon-8b-base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "persimmon-8b-base",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "adept/persimmon-8b-base",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000"
16 |         ],
17 |         "resources": {
18 |             "CPU": 6000,
19 |             "Mem": 18000,
20 |             "GPU": {
21 |                 "Type": "Any",
22 |                 "Count": 2,
23 |                 "vRam": 15000
24 |             }
25 |         },
26 |         "envs": [
27 |             [
28 |                 "LD_LIBRARY_PATH",
29 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |             ]
31 |         ],
32 |         "mounts": [
33 |             {
34 |                 "hostpath": "/home/brad/cache",
35 |                 "mountpath": "/root/.cache/huggingface"
36 |             }
37 |         ],
38 |         "endpoint": {
39 |             "path": "/v1/completions",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "probe": {
44 |             "path": "/health",
45 |             "port": 8000,
46 |             "schema": "Http"
47 |         },
48 |         "api_type": {
49 |             "openai": {
50 |                 "name": "adept/persimmon-8b-base",
51 |                 "max_tokens": 1000,
52 |                 "temperature": 0
53 |             }
54 |         },
55 |         "keepalive": "Blob"
56 |     }
57 | }


--------------------------------------------------------------------------------
/config/persimmon-8b-chat.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "persimmon-8b-chat",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "adept/persimmon-8b-chat",
11 |             "--enforce-eager",
12 |             "--disable-custom-all-reduce",
13 |             "--trust-remote-code",
14 |             "--max-model-len",
15 |             "2000"
16 |         ],
17 |         "resources": {
18 |             "CPU": 6000,
19 |             "Mem": 18000,
20 |             "GPU": {
21 |                 "Type": "Any",
22 |                 "Count": 2,
23 |                 "vRam": 15000
24 |             }
25 |         },
26 |         "envs": [
27 |             [
28 |                 "LD_LIBRARY_PATH",
29 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |             ]
31 |         ],
32 |         "mounts": [
33 |             {
34 |                 "hostpath": "/home/brad/cache",
35 |                 "mountpath": "/root/.cache/huggingface"
36 |             }
37 |         ],
38 |         "endpoint": {
39 |             "path": "/v1/completions",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "probe": {
44 |             "path": "/health",
45 |             "port": 8000,
46 |             "schema": "Http"
47 |         },
48 |         "api_type": {
49 |             "openai": {
50 |                 "name": "adept/persimmon-8b-chat",
51 |                 "max_tokens": 1000,
52 |                 "temperature": 0
53 |             }
54 |         },
55 |         "keepalive": "Blob"
56 |     }
57 | }


--------------------------------------------------------------------------------
/config/public.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "tenant",
 3 |     "tenant": "system",
 4 |     "namespace": "system",
 5 |     "name": "public",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/reader.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "reader-lm",
 6 |     "spec": {
 7 |         "image": "vllm/vllm-openai:v0.7.3",
 8 |         "commands": [
 9 |             "--model",
10 |             "jinaai/reader-lm-1.5b",
11 |             "--enforce-eager"
12 |         ],
13 |         "resources": {
14 |             "CPU": 100,
15 |             "Mem": 200,
16 |             "GPU": {
17 |                 "Type": "RTX3060",
18 |                 "Count": 1
19 |             }
20 |         },
21 |         "envs": [
22 |             [
23 |                 "LD_LIBRARY_PATH",
24 |                 "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
25 |             ]
26 |         ],
27 |         "mounts": [
28 |             {
29 |                 "hostpath": "/home/brad/cache",
30 |                 "mountpath": "/root/.cache/huggingface"
31 |             }
32 |         ],
33 |         "endpoint": {
34 |             "path": "/v1/completions",
35 |             "port": 8000,
36 |             "schema": "Http"
37 |         },
38 |         "probe": {
39 |             "path": "/health",
40 |             "port": 8000,
41 |             "schema": "Http"
42 |         },
43 |         "api_type": {
44 |             "openai": {
45 |                 "name": "jinaai/reader-lm-1.5b",
46 |                 "max_tokens": 200,
47 |                 "temperature": 0
48 |             }
49 |         }
50 |     }
51 | }


--------------------------------------------------------------------------------
/config/stabilityai_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "stabilityai",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/stable-diffusion-xl-base-1.0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "stabilityai",
 5 |     "name": "stable-diffusion-xl-base-1.0",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm-openai-upgraded:v0.1.0",
 9 |             "entrypoint": [
10 |                 "/usr/bin/python3"
11 |             ],
12 |             "commands": [
13 |                 "/usr/lib/run_stablediffusion.py"
14 |             ],
15 |             "resources": {
16 |                 "CPU": 20000,
17 |                 "Mem": 50000,
18 |                 "GPU": {
19 |                     "Type": "Any",
20 |                     "Count": 1,
21 |                     "vRam": 13800
22 |                 }
23 |             },
24 |             "envs": [
25 |                 [
26 |                     "height",
27 |                     "512"
28 |                 ],
29 |                 [
30 |                     "width",
31 |                     "512"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "port": 8000,
42 |                 "schema": "Http",
43 |                 "probe": "/health"
44 |             },
45 |             "sample_query": {
46 |                 "apiType": "text2img",
47 |                 "prompt": "An astronaut riding a green horse",
48 |                 "path": "funccall",
49 |                 "body": {}
50 |             },
51 |             "standby": {
52 |                 "gpu": "Blob",
53 |                 "pageable": "Blob",
54 |                 "pinned": "Blob"
55 |             }
56 |         }
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/stablelm-3b-4e1t.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "stablelm-3b-4e1t",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "stabilityai/stablelm-3b-4e1t",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--max-model-len",
15 |                 "2000"
16 |             ],
17 |             "resources": {
18 |                 "CPU": 6000,
19 |                 "Mem": 18000,
20 |                 "GPU": {
21 |                     "Type": "Any",
22 |                     "Count": 1,
23 |                     "vRam": 8000
24 |                 }
25 |             },
26 |             "envs": [
27 |                 [
28 |                     "LD_LIBRARY_PATH",
29 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
30 |                 ]
31 |             ],
32 |             "mounts": [
33 |                 {
34 |                     "hostpath": "/home/brad/cache",
35 |                     "mountpath": "/root/.cache/huggingface"
36 |                 }
37 |             ],
38 |             "endpoint": {
39 |                 "path": "/v1/completions",
40 |                 "port": 8000,
41 |                 "schema": "Http"
42 |             },
43 |             "probe": {
44 |                 "path": "/health",
45 |                 "port": 8000,
46 |                 "schema": "Http"
47 |             },
48 |             "api_type": {
49 |                 "openai": {
50 |                     "name": "stabilityai/stablelm-3b-4e1t",
51 |                     "max_tokens": 1000,
52 |                     "temperature": 0
53 |                 }
54 |             },
55 |             "keepalive": "Blob"
56 |         }
57 |     }
58 | }


--------------------------------------------------------------------------------
/config/stablelm-tuned-alpha-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "t1",
 4 |     "namespace": "ns1",
 5 |     "name": "stablelm-tuned-alpha-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "stabilityai/stablelm-tuned-alpha-7b",
12 |                 "--enforce-eager",
13 |                 "--disable-custom-all-reduce",
14 |                 "--trust-remote-code",
15 |                 "--max-model-len",
16 |                 "2000",
17 |                 "--tensor-parallel-size=2"
18 |             ],
19 |             "resources": {
20 |                 "CPU": 6000,
21 |                 "Mem": 50000,
22 |                 "GPU": {
23 |                     "Type": "Any",
24 |                     "Count": 2,
25 |                     "vRam": 15000
26 |                 }
27 |             },
28 |             "envs": [
29 |                 [
30 |                     "LD_LIBRARY_PATH",
31 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
32 |                 ]
33 |             ],
34 |             "mounts": [
35 |                 {
36 |                     "hostpath": "/home/brad/cache",
37 |                     "mountpath": "/root/.cache/huggingface"
38 |                 }
39 |             ],
40 |             "endpoint": {
41 |                 "path": "/v1/completions",
42 |                 "port": 8000,
43 |                 "schema": "Http"
44 |             },
45 |             "probe": {
46 |                 "path": "/health",
47 |                 "port": 8000,
48 |                 "schema": "Http"
49 |             },
50 |             "api_type": {
51 |                 "openai": {
52 |                     "name": "stabilityai/stablelm-tuned-alpha-7b",
53 |                     "max_tokens": 1000,
54 |                     "temperature": 0
55 |                 }
56 |             },
57 |             "keepalive": "Blob"
58 |         }
59 |     }
60 | }


--------------------------------------------------------------------------------
/config/starcoder2-3b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "bigcode",
 5 |     "name": "starcoder2-3b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "bigcode/starcoder2-3b",
12 |                 "--disable-custom-all-reduce",
13 |                 "--max-model-len",
14 |                 "2000"
15 |             ],
16 |             "resources": {
17 |                 "CPU": 12000,
18 |                 "Mem": 50000,
19 |                 "GPU": {
20 |                     "Type": "Any",
21 |                     "Count": 1,
22 |                     "vRam": 13800
23 |                 }
24 |             },
25 |             "envs": [
26 |                 [
27 |                     "LD_LIBRARY_PATH",
28 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
29 |                 ],
30 |                 [
31 |                     "VLLM_CUDART_SO_PATH",
32 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
33 |                 ]
34 |             ],
35 |             "mounts": [
36 |                 {
37 |                     "hostpath": "/home/brad/cache",
38 |                     "mountpath": "/root/.cache/huggingface"
39 |                 }
40 |             ],
41 |             "endpoint": {
42 |                 "port": 8000,
43 |                 "schema": "Http",
44 |                 "probe": "/health"
45 |             },
46 |             "sample_query": {
47 |                 "apiType": "openai",
48 |                 "prompt": "def print_hello_world():",
49 |                 "path": "v1/completions",
50 |                 "body": {
51 |                     "model": "bigcode/starcoder2-3b",
52 |                     "max_tokens": "1000",
53 |                     "temperature": "0",
54 |                     "stream": "true"
55 |                 }
56 |             },
57 |             "standby": {
58 |                 "gpu": "Blob",
59 |                 "pageable": "Blob",
60 |                 "pinned": "Blob"
61 |             }
62 |         }
63 |     }
64 | }


--------------------------------------------------------------------------------
/config/starcoder2-7b.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "function",
 3 |     "tenant": "public",
 4 |     "namespace": "bigcode",
 5 |     "name": "starcoder2-7b",
 6 |     "object": {
 7 |         "spec": {
 8 |             "image": "vllm/vllm-openai:v0.7.3",
 9 |             "commands": [
10 |                 "--model",
11 |                 "bigcode/starcoder2-7b",
12 |                 "--disable-custom-all-reduce",
13 |                 "--trust-remote-code",
14 |                 "--max-model-len",
15 |                 "2000",
16 |                 "--tensor-parallel-size=2"
17 |             ],
18 |             "resources": {
19 |                 "CPU": 20000,
20 |                 "Mem": 50000,
21 |                 "GPU": {
22 |                     "Type": "Any",
23 |                     "Count": 2,
24 |                     "vRam": 13800
25 |                 }
26 |             },
27 |             "envs": [
28 |                 [
29 |                     "LD_LIBRARY_PATH",
30 |                     "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
31 |                 ],
32 |                 [
33 |                     "VLLM_CUDART_SO_PATH",
34 |                     "/usr/local/cuda-12.1/targets/x86_64-linux/lib/libcudart.so.12"
35 |                 ]
36 |             ],
37 |             "mounts": [
38 |                 {
39 |                     "hostpath": "/home/brad/cache",
40 |                     "mountpath": "/root/.cache/huggingface"
41 |                 }
42 |             ],
43 |             "endpoint": {
44 |                 "port": 8000,
45 |                 "schema": "Http",
46 |                 "probe": "/health"
47 |             },
48 |             "sample_query": {
49 |                 "apiType": "openai",
50 |                 "prompt": "def print_hello_world():",
51 |                 "path": "v1/completions",
52 |                 "body": {
53 |                     "model": "bigcode/starcoder2-7b",
54 |                     "max_tokens": "1000",
55 |                     "temperature": "0",
56 |                     "stream": "true"
57 |                 }
58 |             },
59 |             "standby": {
60 |                 "gpu": "Blob",
61 |                 "pageable": "Blob",
62 |                 "pinned": "Blob"
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/config/state-spaces_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "state-spaces",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/tenant1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "tenant",
 3 |     "tenant": "system",
 4 |     "namespace": "system",
 5 |     "name": "t1",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/config/tiiuae_namespace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "namespace",
 3 |     "tenant": "public",
 4 |     "namespace": "system",
 5 |     "name": "tiiuae",
 6 |     "object": {
 7 |         "spec": {},
 8 |         "status": {
 9 |             "disable": false
10 |         }
11 |     }
12 | }


--------------------------------------------------------------------------------
/dashboard/Makefile:
--------------------------------------------------------------------------------
 1 | # pip install grpcio grpcio-tools
 2 | # pip install psycopg2-binary 
 3 | all: protoc
 4 | run:
 5 | 	KEYCLOAK_URL=http://192.168.0.22:1260/authn python3 ./app.py
 6 | 
 7 | protoc:
 8 | 	python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. qobjs.proto
 9 | 	python3 -m grpc_tools.protoc -I ../qshare/proto --python_out=. --grpc_python_out=. na.proto
10 | 


--------------------------------------------------------------------------------
/dashboard/__pycache__/na_pb2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2.cpython-38.pyc


--------------------------------------------------------------------------------
/dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/na_pb2_grpc.cpython-38.pyc


--------------------------------------------------------------------------------
/dashboard/__pycache__/qobjs_pb2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2.cpython-38.pyc


--------------------------------------------------------------------------------
/dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/__pycache__/qobjs_pb2_grpc.cpython-38.pyc


--------------------------------------------------------------------------------
/dashboard/doc:
--------------------------------------------------------------------------------
1 | ../doc


--------------------------------------------------------------------------------
/dashboard/gunicorn.conf.py:
--------------------------------------------------------------------------------
1 | bind = "0.0.0.0:1250"
2 | workers = 4
3 | worker_class = "gevent"
4 | timeout = 30


--------------------------------------------------------------------------------
/dashboard/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiofiles==23.1.0
 2 | async-generator==1.10
 3 | blinker==1.6.2
 4 | exceptiongroup==1.1.1
 5 | Flask==2.3.2
 6 | grpcio==1.54.2
 7 | grpcio-tools==1.54.2
 8 | h2==3.2.0
 9 | hpack==3.0.0
10 | html5tagger==1.3.0
11 | httptools==0.5.0
12 | hyperframe==5.2.0
13 | itsdangerous==2.1.2
14 | janus==1.0.0
15 | Jinja2==3.1.2
16 | MarkupSafe==2.1.3
17 | multidict==6.0.4
18 | numpy==1.24.3
19 | protobuf==4.23.2
20 | purerpc==0.8.0
21 | sanic==23.3.0
22 | sanic-routing==22.8.0
23 | sniffio==1.3.0
24 | tracerite==1.1.0
25 | typing_extensions==4.6.3
26 | ujson==5.7.0
27 | uvloop==0.17.0
28 | websockets==11.0.3
29 | Werkzeug==2.3.6
30 | flask-cors==4.0.1
31 | requests==2.25.1
32 | markdown==3.7
33 | gunicorn==23.0.0
34 | gevent==24.2.1
35 | Authlib==1.3.2
36 | 


--------------------------------------------------------------------------------
/dashboard/sql/kv.sql:
--------------------------------------------------------------------------------
 1 | DROP DATABASE auditdb;
 2 | CREATE DATABASE auditdb;
 3 | 
 4 | \c auditdb;
 5 | 
 6 | DROP TABLE Pod;
 7 | CREATE TABLE Pod (
 8 |     tenant          VARCHAR NOT NULL,
 9 |     namespace       VARCHAR NOT NULL,
10 |     fpname          VARCHAR NOT NULL,
11 |     fprevision      bigint,
12 |     id              VARCHAR NOT NULL,
13 |     nodename        VARCHAR NOT NULL,
14 |     state           VARCHAR NOT NULL,
15 |     updatetime      TIMESTAMP,
16 |     PRIMARY KEY(tenant, namespace, fpname, fprevision, id)
17 | );
18 | 
19 | DROP TABLE PodAudit;
20 | CREATE TABLE PodAudit (
21 |     tenant          VARCHAR NOT NULL,
22 |     namespace       VARCHAR NOT NULL,
23 |     fpname          VARCHAR NOT NULL,
24 |     fprevision      bigint,
25 |     id              VARCHAR NOT NULL,
26 |     nodename        VARCHAR NOT NULL,
27 |     action          VARCHAR NOT NULL,
28 |     state           VARCHAR NOT NULL,
29 |     updatetime      TIMESTAMP,
30 |     PRIMARY KEY(tenant, namespace, fpname, fprevision, id, updatetime)
31 | );
32 | 
33 | DROP TABLE ReqAudit;
34 | CREATE TABLE ReqAudit (
35 |     seqid           SERIAL PRIMARY KEY, 
36 |     podkey          VARCHAR NOT NULL,
37 |     audittime       TIMESTAMP,
38 |     keepalive       bool,
39 |     ttft            int,            -- Time to First Token
40 |     latency         int
41 | );
42 | 
43 | CREATE USER audit_user WITH PASSWORD '123456';
44 | GRANT ALL ON ALL TABLES IN SCHEMA public to audit_user;
45 | GRANT USAGE ON SEQUENCE reqaudit_seqid_seq TO audit_user;
46 | 
47 | -- https://stackoverflow.com/questions/18664074/getting-error-peer-authentication-failed-for-user-postgres-when-trying-to-ge
48 | 
49 | DROP DATABASE testdb;
50 | CREATE DATABASE testdb;
51 | 
52 | \c testdb;
53 | 
54 | DROP TABLE Pod;
55 | CREATE TABLE Pod (
56 |     tenant          VARCHAR NOT NULL
57 | );
58 | 
59 | insert into pod values ('asdf');
60 | 
61 | CREATE OR REPLACE FUNCTION notification_trigger() RETURNS TRIGGER AS 
62 | $$
63 | BEGIN
64 |     PERFORM pg_notify('your_channel_name', 
65 |             to_json(NEW)::TEXT
66 |     );
67 |     RETURN NEW;
68 | END;
69 | $$ LANGUAGE plpgsql;
70 | 
71 | CREATE OR REPLACE TRIGGER capture_change_trigger AFTER INSERT OR UPDATE OR DELETE ON pod
72 | FOR EACH ROW EXECUTE FUNCTION notification_trigger();
73 | 
74 | 


--------------------------------------------------------------------------------
/dashboard/sql/secret.sql:
--------------------------------------------------------------------------------
 1 | --DROP TABLE ApiKey;
 2 | CREATE TABLE Apikey (
 3 |     apikey          VARCHAR NOT NULL,
 4 |     username        VARCHAR NOT NULL,
 5 |     keyname         VARCHAR NOT NULL,
 6 |     createtime      TIMESTAMP,
 7 |     PRIMARY KEY(apikey)
 8 | );
 9 | 
10 | CREATE UNIQUE INDEX apikey_idx_realm_username ON Apikey (username, keyname);
11 | 
12 | CREATE TABLE UserRole (
13 |     username        VARCHAR NOT NULL,
14 |     rolename       VARCHAR NOT NULL,
15 |     PRIMARY KEY(username, rolename)
16 | );
17 | 
18 | CREATE INDEX userrole_idx_rolename ON UserRole (rolename);
19 | 


--------------------------------------------------------------------------------
/dashboard/static/button.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/dashboard/static/button.gif


--------------------------------------------------------------------------------
/dashboard/templates/index.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | 
 5 | <h2>func audit</h2>
 6 | 
 7 | <style>
 8 |     table,
 9 |     th,
10 |     td {
11 |         border: 1px solid black;
12 |     }
13 | </style>
14 | 
15 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/log.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 |     <h2>func log</h2>
 5 | 
 6 |     <style>
 7 |         table, th, td {
 8 |           border:1px solid black;
 9 |         }
10 |     </style>
11 |     <table style="width:100%">
12 |     <tr>
13 |         <th>namespace</th>
14 |         <th>func name</th>
15 |         <th>func id</th>
16 |     </tr>
17 |     <tr>
18 |         <td>{{ namespace }}</td>
19 |         <td>{{ funcName }}</td>
20 |         <td>{{ funcId }}</td>
21 |     </tr>
22 |     </table>
23 | 
24 |     {{ log | safe  }}
25 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/markdown.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | 
 5 | <body>
 6 |     <style>
 7 |         table,
 8 |         th,
 9 |         td {
10 |             border: 1px solid black;
11 |         }
12 |     </style>
13 |     <table style="width:60%">
14 |         <tr>
15 |             <td>{{ md_content|safe }}</td>
16 |         </tr>
17 |     </table>
18 | </body>
19 | 
20 | 
21 | 
22 | 
23 | {{ log | safe }}
24 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/node.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | <h2>Node</h2>
 5 | 
 6 | <style>
 7 |     table,
 8 |     th,
 9 |     td {
10 |         border: 1px solid black;
11 |     }
12 | </style>
13 | <table style="width:100%">
14 |     <tr>
15 |         <th>Node Name</th>
16 |         <th>Node</th>
17 |     </tr>
18 |     <tr>
19 |         <td>{{ name }}</td>
20 |         {% autoescape false %}
21 |         <td>{{ node }}</td>
22 |         {% endautoescape %}
23 |     </tr>
24 | </table>
25 | 
26 | {{ log | safe }}
27 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/node_list.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | <h2>Nodes</h2>
 5 | <style>
 6 |     table,
 7 |     th,
 8 |     td {
 9 |         border: 1px solid black;
10 |     }
11 | </style>
12 | <table style="width:100%">
13 |     <tr>
14 |         <th>nodename</th>
15 |         <th>Ip Address</th>
16 |         <th>CIDR</th>
17 |         <th>CPU Count</th>
18 |         <th>CPU Memory (GB) </th>
19 |         <th>MaxContextPerGPU </th>
20 |         <th>BlobStore </th>
21 |         <th>GPUs </th>
22 |     </tr>
23 |     {% for node in nodes %}
24 |     <tr>
25 |         <td><a href="{{ hosturl }}node?name={{ node["name"] }}">{{
26 |                 node["name"] }}</a></td>
27 |         {% autoescape false %}
28 |         <td>{{ node['object']['nodeIp'] }}</td>
29 |         <td>{{ node['object']['cidr'] }}</td>
30 |         <td>{{ node['object']['resources']['CPU'] // 1000 }}</td>
31 |         <td>{{ node['object']['resources']['Mem'] // 1000 }}</td>
32 |         <td>{{ node['object']['resources']['MaxContextPerGPU'] }}</td>
33 |         <td>{{ node['object']['blobStoreEnable'] }}</td>
34 |         <td>{{ node['object']['resources']['GPUs'] }}</td>
35 |         {% endautoescape %}
36 |     </tr>
37 |     {% endfor %}
38 | </table>
39 | {{ hosturl }}
40 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/pod.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | 
 5 | <h2>funcpod</h2>
 6 | 
 7 | <style>
 8 |     table,
 9 |     th,
10 |     td {
11 |         border: 1px solid black;
12 |     }
13 | </style>
14 | <table style="width:100%">
15 |     <tr>
16 |         <th>tenant</th>
17 |         <th>namespace</th>
18 |         <th>podname</th>
19 |     </tr>
20 |     <tr>
21 |         <td>{{ tenant }}</td>
22 |         <td>{{ namespace }}</td>
23 |         <td>{{ podname }}</td>
24 |     </tr>
25 | </table>
26 | 
27 | {% if audits %}
28 | <h2>state</h2>
29 | <table style="width:100%">
30 |     <tr>
31 |         <th>state</th>
32 |         <th>time</th>
33 |     </tr>
34 |     {% for audit in audits %}
35 |     <tr>
36 |         <td>{{ audit["state"] }}</td>
37 |         <td>{{ audit["updatetime"] }}</td>
38 |     </tr>
39 |     {% endfor %}
40 | </table>
41 | {% endif %}
42 | <h2>log</h2>
43 | <table style="width:100%">
44 |     <tr>
45 |         {% autoescape false %}
46 |         <td>{{ log }}</td>
47 |         {% endautoescape %}
48 |     </tr>
49 | </table>
50 | 
51 | <!-- <h2>stats</h2>
52 | <table style="width:100%">
53 |     <tr>
54 |         {% autoescape false %}
55 |         <td>{{ audits[1] }}</td>
56 |         {% endautoescape %}
57 |     </tr>
58 | </table> -->
59 | {{ hosturl }}
60 | {% endblock %}


--------------------------------------------------------------------------------
/dashboard/templates/snapshot_list.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | 
 3 | {% block content %}
 4 | <style>
 5 |     table,
 6 |     th,
 7 |     td {
 8 |         border: 1px solid black;
 9 |     }
10 | </style>
11 | 
12 | <h2>Snapshots</h2>
13 | <table style="width:100%">
14 |     <tr>
15 |         <th>Snapshot Id</th>
16 |         <th>nodename</th>
17 |         <th>state</th>
18 |         <th>gpu</th>
19 |         <th>pageable</th>
20 |         <th>pinned</th>
21 |         <th>docker image name</th>
22 |         <th>build id</th>
23 |     </tr>
24 |     {% for snapshot in snapshots %}
25 |     <tr>
26 |         <td>{{ snapshot["name"] }}</td>
27 |         <td>{{ snapshot["object"]['nodename'] }}</td>
28 |         <td>{{ snapshot["object"]['state'] }}</td>
29 |         <td>{{ snapshot["object"]['info']['gpuMemSizes'] }}</td>
30 |         <td>{{ snapshot["object"]['info']['processCheckpointSize'] // (1024*1024) }} MB</td>
31 |         <td>{{ snapshot["object"]['info']['hostMemSize'] // (1024*1024) }} MB </td>
32 |         <td>{{ snapshot["object"]['meta']['imagename'] }}</td>
33 |         <td>{{ snapshot["object"]['meta']['buildId'] }}</td>
34 |     </tr>
35 |     {% endfor %}
36 | </table>
37 | {{ hosturl }}
38 | {% endblock %}


--------------------------------------------------------------------------------
/deployment/dashboard.Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1
 2 | 
 3 | FROM python:3.10-slim-buster
 4 | 
 5 | WORKDIR /
 6 | 
 7 | RUN apt-get -y update
 8 | RUN apt-get install -y libpq-dev gcc
 9 | RUN apt-get install -y bash
10 | RUN apt-get install -y nginx
11 | RUN apt-get install -y curl
12 | 
13 | COPY requirements.txt requirements.txt
14 | RUN pip3 install -r requirements.txt
15 | 
16 | COPY . .
17 | 
18 | COPY nginx.conf /etc/nginx/sites-available/default
19 | 
20 | CMD service nginx start && gunicorn -w 4 -b 0.0.0.0:1250 app:app
21 | # CMD service nginx start && python3 ./app.py
22 | # CMD python3 ./app.py


--------------------------------------------------------------------------------
/deployment/llava.Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1
 2 | 
 3 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
 4 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04
 5 | WORKDIR /
 6 | RUN apt-get -y update
 7 | RUN apt-get -y install libnuma-dev fuse3 libkeyutils-dev libaio-dev
 8 | 
 9 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml
10 | COPY node.json /opt/inferx/config/node.json
11 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so
12 | COPY . .
13 | CMD ["./onenode"]


--------------------------------------------------------------------------------
/deployment/one.Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1
 2 | 
 3 | #FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04
 4 | FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
 5 | WORKDIR /
 6 | RUN apt-get -y update
 7 | RUN apt-get -y install libnuma-dev 
 8 | RUN apt-get -y install fuse3 
 9 | RUN apt-get -y install libkeyutils-dev 
10 | RUN apt-get -y install libaio-dev 
11 | # RUN apt-get -y install libssl3
12 | RUN apt-get -y install libssl-dev
13 | 
14 | COPY onenode_logging_config.yaml /opt/inferx/config/onenode_logging_config.yaml
15 | COPY node.json /opt/inferx/config/node.json
16 | COPY libnvmedrv.so /usr/lib/libnvmedrv.so
17 | COPY . .
18 | CMD ["./onenode"]


--------------------------------------------------------------------------------
/deployment/spdk.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use Ubuntu as the base image
 2 | FROM ubuntu:22.04
 3 | 
 4 | # Set environment variables
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | # Install dependencies
 8 | RUN apt-get update && apt-get install -y \
 9 |     build-essential \
10 |     git \
11 |     gcc \
12 |     make \
13 |     libaio-dev \
14 |     libpciaccess-dev \
15 |     python3 \
16 |     python3-pip \
17 |     pciutils \
18 |     pkg-config kmod \
19 |     libjson-c-dev libcunit1-dev libssl-dev libcmocka-dev uuid-dev libiscsi-dev libkeyutils-dev libncurses5-dev libncursesw5-dev unzip libfuse3-dev patchelf \
20 |     python3-configshell-fb python3-pexpect nasm libnuma-dev \
21 |     autoconf automake libtool help2man systemtap-sdt-dev \
22 |     astyle lcov clang sg3-utils shellcheck abigail-tools bash-completion ruby-dev pycodestyle bundler rake python3-paramiko curl \
23 |     libpmem-dev libpmemblk-dev libpmemobj-dev \
24 |     librados-dev librbd-dev libibverbs-dev librdmacm-dev 
25 | 
26 | # Clone the SPDK repository
27 | RUN git clone https://github.com/spdk/spdk.git /spdk --recursive
28 | 
29 | # Set working directory
30 | WORKDIR /spdk
31 | 
32 | RUN ./scripts/pkgdep.sh --all
33 | RUN ./configure
34 | RUN make
35 | 
36 | # Set up entrypoint to provide SPDK CLI tools
37 | ENTRYPOINT scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json && scripts/setup.sh
38 | 


--------------------------------------------------------------------------------
/deployment/spdk.script:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e  # exit immediately if any command fails
 3 | 
 4 | echo "Generating NVMe config..."
 5 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json
 6 | sleep 1
 7 | echo "Running SPDK setup..."
 8 | scripts/setup.sh
 9 | sleep 1
10 | scripts/gen_nvme.sh --json-with-subsystems > /opt/inferx/config/nvme_bdev_all.json
11 | sleep 1
12 | scripts/setup.sh
13 | echo "SPDK setup complete."
14 | 
15 | while true; do sleep 86400; done
16 | 


--------------------------------------------------------------------------------
/deployment/spdk2.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use Ubuntu as the base image
 2 | FROM inferx/spdk-container:v0.1.0
 3 | 
 4 | # Set environment variables
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | COPY entrypoint.sh /spdk/entrypoint.sh
 8 | 
 9 | # Set working directory
10 | WORKDIR /spdk
11 | 
12 | 
13 | # Set up entrypoint to provide SPDK CLI tools
14 | ENTRYPOINT bash /spdk/entrypoint.sh
15 | 


--------------------------------------------------------------------------------
/deployment/vllm-opai.Dockerfile:
--------------------------------------------------------------------------------
 1 | # docker build -t vllm-openai-upgraded .
 2 | FROM vllm/vllm-openai:v0.7.3
 3 | WORKDIR /
 4 | # Upgrade the transformers library
 5 | RUN apt-get -y update
 6 | RUN apt-get install libglib2.0-0 -y
 7 | RUN apt-get install libgl1 -y
 8 | 
 9 | RUN pip install --upgrade transformers
10 | RUN pip install --upgrade safetensors
11 | RUN pip install diffusers --upgrade
12 | RUN pip install invisible_watermark accelerate 
13 | 
14 | COPY run_model.py /usr/lib/run_model.py
15 | COPY run_llava.py /usr/lib/run_llava.py
16 | COPY run_stablediffusion.py /usr/lib/run_stablediffusion.py
17 | 
18 | 


--------------------------------------------------------------------------------
/doc/GPUSnapshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/GPUSnapshot.png


--------------------------------------------------------------------------------
/doc/architect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/architect.png


--------------------------------------------------------------------------------
/doc/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/comparison.png


--------------------------------------------------------------------------------
/doc/daemon.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "runtimes": {
 3 |         "nvidia": {
 4 |             "args": [],
 5 |             "path": "nvidia-container-runtime"
 6 |         },
 7 |         "inferx": {
 8 |             "path": "/opt/inferx/bin/inferx"
 9 |         }
10 |     }
11 | }


--------------------------------------------------------------------------------
/doc/home.md:
--------------------------------------------------------------------------------
1 | ../README.md


--------------------------------------------------------------------------------
/doc/infer_Profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/infer_Profile.png


--------------------------------------------------------------------------------
/doc/keycloak.md:
--------------------------------------------------------------------------------
 1 | 1. Create Realm "Inferx"
 2 | 2. Create Client "infer_client" in  Realm "Inferx"
 3 |    a. Enable Client authentication
 4 |    b. Add Valid redirect URI
 5 |        https://inferx.net:8000/*
 6 |        http://<localhost>:1250/*
 7 |        http://<localhost>:81/*
 8 |        http://<localhost>:4000/*
 9 |    c. Add web origins
10 |        https://inferx.net:8000
11 |        http://<localhost>:1250
12 |        http://<localhost>:81
13 |        http://<localhost>:4000
14 |     d. Enable "Direct Access Grants Enabled"
15 | 3. Update KEYCLOAK_CLIENT_SECRET in docker-compose_blob.yml
16 | 4. Update the KEYCLOAK_URL with local address
17 | 
18 | 
19 | curl -X POST "http://192.168.0.22:1260/authn/realms/inferx/protocol/openid-connect/token" \
20 |      -H "Content-Type: application/x-www-form-urlencoded" \
21 |      -d "client_id=infer_client" \
22 |      -d "client_secret=M2Dse5531tdtyipZdGizLEeoOVgziQRX" \
23 |      -d "username=testuser1" \
24 |      -d "password=test" \
25 |      -d "grant_type=password"
26 | 
27 | 


--------------------------------------------------------------------------------
/doc/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo.png


--------------------------------------------------------------------------------
/doc/logo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo1.png


--------------------------------------------------------------------------------
/doc/logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inferx-net/inferx/2d1f46d40f90271a92dd27bbfc0ec276e4a11823/doc/logo2.png


--------------------------------------------------------------------------------
/inferxlib/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "inferxlib"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | serde = { version = "1.0", features = ["derive"] }
10 | serde_json = "1.0"
11 | regex = "1.7.1"
12 | bollard = "=0.17.0"
13 | reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
14 | log = "0.4.17"
15 | log4rs = "1"
16 | 
17 | [dependencies.lazy_static]
18 | version = "1.0"
19 | features = ["spin_no_std"]
20 | 


--------------------------------------------------------------------------------
/inferxlib/src/common.rs:
--------------------------------------------------------------------------------
 1 | use serde_json::Error as SerdeJsonError;
 2 | 
 3 | pub type Result<T> = core::result::Result<T, Error>;
 4 | 
 5 | #[derive(Debug)]
 6 | pub enum Error {
 7 |     CommonError(String),
 8 |     NotExist(String),
 9 |     Exist(String),
10 |     SchedulerNoEnoughResource(String),
11 |     SerdeJsonError(SerdeJsonError),
12 |     StdIOErr(std::io::Error),
13 |     ReqWestErr(reqwest::Error),
14 | }
15 | 
16 | impl From<SerdeJsonError> for Error {
17 |     fn from(item: SerdeJsonError) -> Self {
18 |         return Self::SerdeJsonError(item);
19 |     }
20 | }
21 | 
22 | impl From<std::io::Error> for Error {
23 |     fn from(item: std::io::Error) -> Self {
24 |         return Self::StdIOErr(item);
25 |     }
26 | }
27 | 
28 | impl From<reqwest::Error> for Error {
29 |     fn from(item: reqwest::Error) -> Self {
30 |         return Self::ReqWestErr(item);
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/inferxlib/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(dead_code)]
 2 | #![allow(non_snake_case)]
 3 | #![allow(non_upper_case_globals)]
 4 | #![allow(non_camel_case_types)]
 5 | #![allow(deprecated)]
 6 | #![allow(unused_imports)]
 7 | 
 8 | #[macro_use]
 9 | extern crate log;
10 | 
11 | pub mod common;
12 | pub mod data_obj;
13 | pub mod node;
14 | pub mod obj_mgr;
15 | pub mod resource;
16 | pub mod selector;
17 | pub mod validation;
18 | 


--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/cidrlock.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | 
3 | use crate::resource::NodeResources;
4 | 
5 | use crate::data_obj::*;
6 | 
7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)]
8 | pub struct CidrlockSpec {}
9 | 


--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/mod.rs:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | pub mod cidrlock;
16 | pub mod func_mgr;
17 | pub mod funcsnapshot_mgr;
18 | pub mod namespace_mgr;
19 | pub mod node_mgr;
20 | pub mod pod_mgr;
21 | pub mod tenant_mgr;
22 | 


--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/namespace_mgr.rs:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | use serde::{Deserialize, Serialize};
16 | 
17 | use crate::data_obj::*;
18 | 
19 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
20 | pub struct NamespaceObject {
21 |     pub spec: NamespaceSpec,
22 |     pub status: NamespaceStatus,
23 | }
24 | 
25 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
26 | pub struct NamespaceStatus {
27 |     pub disable: bool,
28 | }
29 | 
30 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
31 | pub struct NamespaceSpec {}
32 | 
33 | pub type Namespace = DataObject<NamespaceObject>;
34 | pub type NamespaceMgr = DataObjectMgr<NamespaceObject>;
35 | 
36 | impl Namespace {
37 |     pub const KEY: &'static str = "namespace";
38 | }
39 | 


--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/node_mgr.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | use crate::resource::NodeResources;
 4 | 
 5 | use crate::data_obj::*;
 6 | 
 7 | #[derive(Serialize, Deserialize, Debug, Clone, Default)]
 8 | pub struct NodeSpec {
 9 |     pub nodeIp: String,
10 |     pub podMgrPort: u16,
11 |     pub tsotSvcPort: u16,
12 |     pub stateSvcPort: u16,
13 |     pub cidr: String,
14 |     pub resources: NodeResources,
15 |     pub blobStoreEnable: bool,
16 | }
17 | 
18 | pub type Node = DataObject<NodeSpec>;
19 | pub type NodeMgr = DataObjectMgr<NodeSpec>;
20 | 
21 | impl Node {
22 |     pub const KEY: &'static str = "node_info";
23 |     pub const TENANT: &'static str = "system";
24 |     pub const NAMESPACE: &'static str = "system";
25 | 
26 |     pub fn QletUrl(&self) -> String {
27 |         return format!("http://{}:{}", self.object.nodeIp, self.object.podMgrPort);
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/inferxlib/src/obj_mgr/tenant_mgr.rs:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Quark Container Authors / 2014 The Kubernetes Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | use serde::{Deserialize, Serialize};
16 | 
17 | use crate::data_obj::*;
18 | 
19 | pub const SYSTEM_TENANT: &str = "system";
20 | pub const SYSTEM_NAMESPACE: &str = "system";
21 | 
22 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
23 | pub struct TenantObject {
24 |     pub spec: TenantSpec,
25 |     pub status: TenantStatus,
26 | }
27 | 
28 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
29 | pub struct TenantStatus {
30 |     pub disable: bool,
31 | }
32 | 
33 | #[derive(Serialize, Deserialize, Debug, Default, Clone)]
34 | pub struct TenantSpec {}
35 | 
36 | pub type Tenant = DataObject<TenantObject>;
37 | 
38 | impl Tenant {
39 |     pub const KEY: &'static str = "tenant";
40 | }
41 | 
42 | pub type TenantMgr = DataObjectMgr<TenantObject>;
43 | 


--------------------------------------------------------------------------------
/ixctl_logging_config.yaml:
--------------------------------------------------------------------------------
 1 | appenders:
 2 |   my_stdout:
 3 |     kind: console
 4 |     encoder:
 5 |       pattern: "{h({d(%Y-%m-%d %H:%M:%S)(utc)} - {l}: {m}{n})}"
 6 |   my_file_logger:
 7 |     kind: rolling_file
 8 |     path: "/opt/inferx/log/ixctl.log"
 9 |     encoder:
10 |       pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}"
11 |     policy:
12 |       trigger:
13 |         kind: size
14 |         limit: 50mb
15 |       roller:
16 |         kind: delete
17 |   append_logger:
18 |     kind: file
19 |     path: "/opt/inferx/log/ixctl.log"
20 |     append: true
21 |     encoder:
22 |       pattern: "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}"
23 | root:
24 |   level: info
25 |   appenders:
26 |     - append_logger
27 | 


--------------------------------------------------------------------------------
/k8s/clean-k3sagent.sh:
--------------------------------------------------------------------------------
 1 | # Stop K3s service if running
 2 | sudo systemctl stop k3s-agent || true
 3 | 
 4 | # Run the uninstall script if present
 5 | sudo /usr/local/bin/k3s-agent-uninstall.sh || true
 6 | 
 7 | # Clean residual data
 8 | sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet /etc/systemd/system/k3s-agent.service /usr/local/bin/k3s*
 9 | 
10 | # Optionally clean containerd data if used before
11 | # sudo rm -rf /var/lib/containerd
12 | 
13 | echo "K3s agent cleanup complete."


--------------------------------------------------------------------------------
/k8s/cleanup-k3s.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -euxo pipefail
 3 | 
 4 | ### 1. Stop & Uninstall K3s
 5 | if command -v k3s-uninstall.sh &> /dev/null; then
 6 |   sudo /usr/local/bin/k3s-uninstall.sh           # Stops k3s, removes services, data, etc. :contentReference[oaicite:0]{index=0}
 7 | fi
 8 | if command -v k3s-agent-uninstall.sh &> /dev/null; then
 9 |   sudo /usr/local/bin/k3s-agent-uninstall.sh     # Removes agent components on workers :contentReference[oaicite:1]{index=1}
10 | fi
11 | 
12 | ### 2. Kill any remaining processes
13 | if command -v k3s-killall.sh &> /dev/null; then
14 |   sudo /usr/local/bin/k3s-killall.sh             # Kills k3s-related processes, containerd, etc. :contentReference[oaicite:2]{index=2}
15 | fi
16 | 
17 | ### 3. Remove leftover dirs and configs
18 | # sudo rm -rf /etc/rancher/k3s /var/lib/rancher/k3s /var/lib/kubelet 
19 |             # /etc/containerd /var/lib/containerd           # Clean containerd and K3s state :contentReference[oaicite:3]{index=3}
20 | 
21 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/systemd/system/k3s* /var/lib/containerd /etc/cni /opt/cni
22 | 
23 | 
24 | ### 4. Restart containerd to clear any stuck state
25 | sudo systemctl restart containerd                     # Ensures containerd is fresh :contentReference[oaicite:4]{index=4}
26 | 
27 | echo "✔️  K3s and related components have been fully removed."
28 | 


--------------------------------------------------------------------------------
/k8s/dashboard.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: inferx-dashboard
 5 |   labels:
 6 |     app: inferx-dashboard
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: inferx-dashboard
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: inferx-dashboard
16 |     spec:
17 |       containers:
18 |         - name: inferx-dashboard
19 |           image: inferx/inferx_dashboard:v0.1.1
20 |           imagePullPolicy: IfNotPresent
21 |           env:
22 |             - name: KEYCLOAK_URL
23 |               value: "http://192.168.0.22:31260/authn"
24 |             - name: KEYCLOAK_REALM_NAME
25 |               value: "inferx"
26 |             - name: KEYCLOAK_CLIENT_ID
27 |               value: "infer_client"
28 |             - name: KEYCLOAK_CLIENT_SECRET
29 |               value: "M2Dse5531tdtyipZdGizLEeoOVgziQRX"
30 |             - name: INFERX_APIGW_ADDR
31 |               value: "http://nodeagent:4000"
32 |           volumeMounts:
33 |             - name: cert-volume
34 |               mountPath: /etc/letsencrypt/
35 |           livenessProbe:
36 |             httpGet:
37 |               path: /intro?name=home.md
38 |               port: 1250
39 |             initialDelaySeconds: 10
40 |             periodSeconds: 10
41 |       volumes:
42 |         - name: cert-volume
43 |           hostPath:
44 |             path: /etc/letsencrypt/
45 | ---
46 | apiVersion: v1
47 | kind: Service
48 | metadata:
49 |   name: inferx-dashboard
50 | spec:
51 |   type: NodePort
52 |   selector:
53 |     app: inferx-dashboard
54 |   ports:
55 |     - name: http
56 |       port: 1250
57 |       targetPort: 1250
58 |       nodePort: 31250  


--------------------------------------------------------------------------------
/k8s/db-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: db-pvc
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 |   name: db
16 | spec:
17 |   replicas: 1
18 |   selector:
19 |     matchLabels:
20 |       app: db
21 |   template:
22 |     metadata:
23 |       labels:
24 |         app: db
25 |     spec:
26 |       nodeSelector:
27 |         inferx_storage: data      
28 |       containers:
29 |         - name: postgres
30 |           image: postgres:14.5
31 |           imagePullPolicy: IfNotPresent
32 |           env:
33 |             - name: POSTGRES_USER
34 |               value: audit_user
35 |             - name: POSTGRES_PASSWORD
36 |               value: "123456"
37 |             - name: POSTGRES_DB
38 |               value: auditdb
39 |             - name: PGDATA
40 |               value: /data/postgres
41 |           volumeMounts:
42 |             - name: db-data
43 |               mountPath: /data/postgres
44 |             - name: init-sql
45 |               mountPath: /docker-entrypoint-initdb.d/db.sql
46 |       volumes:
47 |         - name: db-data
48 |           hostPath:
49 |             path: /opt/inferx/data/postgres
50 |             type: DirectoryOrCreate
51 |         - name: init-sql
52 |           hostPath:
53 |             path: /opt/inferx/config/create_table.sql
54 |             type: File
55 | ---
56 | apiVersion: v1
57 | kind: Service
58 | metadata:
59 |   name: db
60 | spec:
61 |   selector:
62 |     app: db
63 |   ports:
64 |     - port: 5432
65 |       targetPort: 5432
66 |       nodePort: 30542
67 |   type: NodePort
68 | 


--------------------------------------------------------------------------------
/k8s/etcd.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: etcd
 5 |   labels:
 6 |     app: etcd
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: etcd
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: etcd
16 |     spec:
17 |       nodeSelector:
18 |         inferx_storage: data      
19 |       containers:
20 |         - name: etcd
21 |           image: quay.io/coreos/etcd:v3.5.13
22 |           imagePullPolicy: IfNotPresent
23 |           volumeMounts:
24 |             - name: etcd-data
25 |               mountPath: /opt/inferx/data/etcd
26 |           command: [ "etcd" ]
27 |           args:
28 |             - "--name=etcd-00"
29 |             - "--data-dir=/opt/inferx/data/etcd"
30 |             - "--advertise-client-urls=http://etcd-00:2379"
31 |             - "--listen-client-urls=http://0.0.0.0:2379"
32 |             - "--initial-advertise-peer-urls=http://etcd-00:2380"
33 |             - "--listen-peer-urls=http://0.0.0.0:2380"
34 |             - "--initial-cluster=etcd-00=http://etcd-00:2380"
35 |       volumes:
36 |         - name: etcd-data
37 |           hostPath:
38 |             path: /opt/inferx/data/etcd
39 |             type: DirectoryOrCreate
40 | ---
41 | apiVersion: v1
42 | kind: Service
43 | metadata:
44 |   name: etcd
45 | spec:
46 |   selector:
47 |     app: etcd
48 |   ports:
49 |     - port: 2379
50 |       targetPort: 2379
51 | 


--------------------------------------------------------------------------------
/k8s/ingress.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: networking.k8s.io/v1
 2 | kind: Ingress
 3 | metadata:
 4 |   name: inferx-ingress
 5 |   annotations:
 6 |     nginx.ingress.kubernetes.io/use-regex: "true"
 7 |     nginx.ingress.kubernetes.io/rewrite-target: /$1$2
 8 |     nginx.ingress.kubernetes.io/proxy-buffering: "off"
 9 |     nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
10 |     nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
11 |     nginx.ingress.kubernetes.io/proxy-chunked: "on"
12 | spec:
13 |   rules:
14 |     - http:
15 |         paths:
16 |           - path: /funccall/
17 |             pathType: Prefix
18 |             backend:
19 |               service:
20 |                 name: nodeagent
21 |                 port:
22 |                   number: 4000
23 |           - path: /authn/
24 |             pathType: Prefix
25 |             backend:
26 |               service:
27 |                 name: keycloak
28 |                 port:
29 |                   number: 8080
30 |           - path: /
31 |             pathType: Prefix
32 |             backend:
33 |               service:
34 |                 name: inferx-dashboard
35 |                 port:
36 |                   number: 1250
37 |     ports:
38 |       web:
39 |         port: 80
40 |         hostPort: 80
41 |         expose: true
42 |       websecure:
43 |         port: 443
44 |         hostPort: 443
45 |         expose: true


--------------------------------------------------------------------------------
/k8s/install-k3s.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | 
 5 | ### 2. Install K3s using Docker runtime
 6 | echo "[+] Installing K3s with Docker as container runtime..."
 7 | curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--docker --node-external-ip=192.168.0.22" sh -
 8 | 
 9 | echo "[+] Waiting for K3s to be ready..."
10 | sleep 10
11 | kubectl get node
12 | 
13 | ### 3. Install Helm (if not installed)
14 | if ! command -v helm &> /dev/null; then
15 |   echo "[+] Installing Helm..."
16 |   curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
17 | fi
18 | 
19 | ### 4. Add NVIDIA Helm repo
20 | echo "[+] Adding NVIDIA Helm repo..."
21 | helm repo add nvidia https://nvidia.github.io/gpu-operator
22 | helm repo update
23 | 
24 | ### 5. Deploy NVIDIA GPU Operator with Docker runtime
25 | echo "[+] Installing NVIDIA GPU Operator..."
26 | export KUBECONFIG=/etc/rancher/k3s/k3s.yam
27 | chmod 555 /etc/rancher/k3s/k3s.yaml
28 | helm install --wait gpu-operator \
29 |   nvidia/gpu-operator \
30 |   -n gpu-operator --create-namespace \
31 |   --set operator.defaultRuntime=docker \
32 |   --set driver.enabled=false \
33 |   --set toolkit.enabled=true
34 | 
35 | echo "[✓] K3s with Docker runtime and NVIDIA GPU Operator installed successfully."
36 | 


--------------------------------------------------------------------------------
/k8s/join-k3sagent.sh:
--------------------------------------------------------------------------------
 1 | # On server node
 2 | # sudo cat /var/lib/rancher/k3s/server/node-token
 3 | # hostname -I  # Use internal IP accessible by the joining node
 4 | 
 5 | 
 6 | sudo /usr/local/bin/k3s-agent-uninstall.sh
 7 | sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /var/lib/cni /run/flannel
 8 | 
 9 | 
10 | 
11 | curl -sfL https://get.k3s.io | K3S_URL=https://192.168.0.22:6443 \
12 |   K3S_TOKEN=K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \
13 |   INSTALL_K3S_EXEC="--docker --with-node-id" sh -
14 | 
15 | sudo k3s agent --docker \
16 |   --server https://192.168.0.22:6443 \
17 |   --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 \
18 |   --with-node-id \
19 |   --node-name inferx-agent1 \
20 |   --debug
21 | 
22 | 
23 | # sudo k3s agent --docker --server https://192.168.0.22:6443 --token K106218814e0f9ea4c0b067750e725aee4a2921804a6867b625abb51b5c11149e9a::server:5401cee22c6fd5315c24574784b8d8a1 --debug
24 | 
25 | 


--------------------------------------------------------------------------------
/k8s/keycloak.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: keycloak
 5 | spec:
 6 |   replicas: 1
 7 |   selector:
 8 |     matchLabels:
 9 |       app: keycloak
10 |   template:
11 |     metadata:
12 |       labels:
13 |         app: keycloak
14 |     spec:
15 |       containers:
16 |         - name: keycloak
17 |           image: quay.io/keycloak/keycloak:latest
18 |           imagePullPolicy: IfNotPresent
19 |           args: ["start-dev", "--verbose"]
20 |           env:
21 |             - name: KEYCLOAK_ADMIN
22 |               value: admin
23 |             - name: KEYCLOAK_ADMIN_PASSWORD
24 |               value: admin
25 |             - name: KC_DB
26 |               value: postgres
27 |             - name: KC_DB_URL
28 |               value: jdbc:postgresql://keycloak-postgres:5432/keycloak
29 |             - name: KC_DB_USERNAME
30 |               value: keycloak
31 |             - name: KC_DB_PASSWORD
32 |               value: "123456"
33 |             - name: KC_HTTP_ENABLED
34 |               value: "true"
35 |             - name: KC_PROXY
36 |               value: edge
37 |             - name: KC_HOSTNAME_STRICT_HTTPS
38 |               value: "false"
39 |             - name: KC_HOSTNAME_STRICT
40 |               value: "false"
41 |             - name: KC_HTTP_RELATIVE_PATH
42 |               value: /authn
43 |           ports:
44 |             - containerPort: 8080
45 | ---
46 | apiVersion: v1
47 | kind: Service
48 | metadata:
49 |   name: keycloak
50 | spec:
51 |   type: NodePort
52 |   selector:
53 |     app: keycloak
54 |   ports:
55 |     - port: 8080
56 |       targetPort: 8080
57 |       nodePort: 31260  # Can customize between 30000–32767
58 | 


--------------------------------------------------------------------------------
/k8s/keycloak_postgres.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: keycloak-db-pvc
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 |   name: keycloak-postgres
16 | spec:
17 |   replicas: 1
18 |   selector:
19 |     matchLabels:
20 |       app: keycloak-postgres
21 |   template:
22 |     metadata:
23 |       labels:
24 |         app: keycloak-postgres
25 |     spec:
26 |       nodeSelector:
27 |         inferx_storage: data      
28 |       containers:
29 |         - name: postgres
30 |           image: postgres:14.5
31 |           imagePullPolicy: IfNotPresent
32 |           env:
33 |             - name: POSTGRES_USER
34 |               value: keycloak
35 |             - name: POSTGRES_PASSWORD
36 |               value: "123456"
37 |             - name: POSTGRES_DB
38 |               value: keycloak
39 |             - name: PGDATA
40 |               value: /data/postgres
41 |           ports:
42 |             - containerPort: 5432
43 |           volumeMounts:
44 |             - name: db-data
45 |               mountPath: /data/postgres  
46 |       volumes:
47 |         - name: db-data
48 |           hostPath:
49 |             path: /opt/inferx/data/postgres_keycloak
50 |             type: DirectoryOrCreate
51 | ---
52 | apiVersion: v1
53 | kind: Service
54 | metadata:
55 |   name: keycloak-postgres
56 | spec:
57 |   selector:
58 |     app: keycloak-postgres
59 |   ports:
60 |     - port: 5432
61 |       targetPort: 5432
62 | 


--------------------------------------------------------------------------------
/k8s/nvidia-test.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: nvidia-test
 5 | spec:
 6 |   containers:
 7 |   - name: cuda-container
 8 |     image: nvidia/cuda:12.2.0-devel-ubuntu20.04
 9 |     imagePullPolicy: IfNotPresent
10 |     command: ["sleep", "infinity"]
11 |     resources:
12 |       limits:
13 |         nvidia.com/gpu: 1
14 |   nodeSelector:
15 |     kubernetes.io/hostname: brad-ms-7d46


--------------------------------------------------------------------------------
/k8s/scheduler.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: scheduler
 5 |   labels:
 6 |     app: scheduler
 7 | spec:
 8 |   replicas: 2
 9 |   selector:
10 |     matchLabels:
11 |       app: scheduler
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: scheduler
16 |     spec:
17 |       hostPID: true
18 |       containers:
19 |         - name: scheduler
20 |           image: inferx/inferx_one:v0.1.1
21 |           imagePullPolicy: IfNotPresent
22 |           env:
23 |           - name: POD_IP
24 |             valueFrom:
25 |               fieldRef:
26 |                 fieldPath: status.podIP          
27 |           - name: RUN_SERVICE
28 |             value: "Scheduler"
29 |           - name: STATESVC_ADDR
30 |             value: "http://statesvc:1237"
31 |           volumeMounts:
32 |             - mountPath: /opt/inferx/
33 |               name: opt-inferx
34 |           command: ["./onenode", "/opt/inferx/config/node.json"]
35 |       volumes:
36 |         - name: opt-inferx
37 |           hostPath:
38 |             path: /opt/inferx/       
39 | ---
40 | apiVersion: v1
41 | kind: Service
42 | metadata:
43 |   name: scheduler
44 | spec:
45 |   type: NodePort
46 |   selector:
47 |     app: scheduler
48 |   ports:
49 |     - name: http
50 |       port: 1238
51 |       targetPort: 1238
52 | 


--------------------------------------------------------------------------------
/k8s/secretdb.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: secret-db-pvc
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: 1Gi
11 | ---
12 | apiVersion: apps/v1
13 | kind: Deployment
14 | metadata:
15 |   name: secret-db
16 | spec:      
17 |   replicas: 1
18 |   selector:
19 |     matchLabels:
20 |       app: secret-db
21 |   template:
22 |     metadata:
23 |       labels:
24 |         app: secret-db
25 |     spec:
26 |       nodeSelector:
27 |         inferx_storage: data
28 |       containers:
29 |         - name: postgres
30 |           image: postgres:14.5
31 |           imagePullPolicy: IfNotPresent
32 |           ports:
33 |             - containerPort: 5432
34 |           env:
35 |             - name: POSTGRES_USER
36 |               value: secret
37 |             - name: POSTGRES_PASSWORD
38 |               value: "123456"
39 |             - name: POSTGRES_DB
40 |               value: secretdb
41 |             - name: PGDATA
42 |               value: /data/postgres
43 |           volumeMounts:
44 |             - name: db-data
45 |               mountPath: /data/postgres
46 |             - name: init-sql
47 |               mountPath: /docker-entrypoint-initdb.d/db.sql
48 |       volumes:
49 |         - name: db-data
50 |           hostPath:
51 |             path: /opt/inferx/data/postgres_secret
52 |             type: DirectoryOrCreate
53 |         - name: init-sql
54 |           hostPath:
55 |             path: /opt/inferx/config/secret.sql
56 |             type: File
57 | ---
58 | apiVersion: v1
59 | kind: Service
60 | metadata:
61 |   name: secret-db
62 | spec:
63 |   selector:
64 |     app: secret-db
65 |   ports:
66 |     - port: 5432
67 |       targetPort: 5432
68 |       nodePort: 30541
69 |   type: NodePort
70 | 
71 | 


--------------------------------------------------------------------------------
/k8s/spdk.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: DaemonSet
 3 | metadata:
 4 |   name: spdk
 5 |   labels:
 6 |     app: spdk
 7 | spec:
 8 |   selector:
 9 |     matchLabels:
10 |       app: spdk
11 |   template:
12 |     metadata:
13 |       labels:
14 |         app: spdk
15 |     spec:
16 |       nodeSelector:
17 |         inferx_nodeType: inferx_blob   
18 |       hostNetwork: true
19 |       hostPID: true
20 |       containers:
21 |         - name: spdk
22 |           image: inferx/spdk-container2:v0.1.0
23 |           imagePullPolicy: IfNotPresent
24 |           securityContext:
25 |             privileged: true
26 |             runAsUser: 0
27 |           env:
28 |             - name: HUGEMEM
29 |               value: "64000"
30 |           volumeMounts:
31 |             - name: hugepages
32 |               mountPath: /dev/hugepages
33 |             - name: lib-modules
34 |               mountPath: /lib/modules
35 |             - name: opt-inferx
36 |               mountPath: /opt/inferx
37 |             - name: run-udev
38 |               mountPath: /run/udev
39 |       volumes:
40 |         - name: hugepages
41 |           hostPath:
42 |             path: /dev/hugepages
43 |         - name: lib-modules
44 |           hostPath:
45 |             path: /lib/modules
46 |         - name: opt-inferx
47 |           hostPath:
48 |             path: /opt/inferx
49 |         - name: run-udev
50 |           hostPath:
51 |             path: /run/udev
52 |       restartPolicy: Always
53 |       tolerations:
54 |         - operator: "Exists"  # Allow on tainted nodes
55 | 


--------------------------------------------------------------------------------
/k8s/statesvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: statesvc
 5 |   labels:
 6 |     app: statesvc
 7 | spec:
 8 |   replicas: 2
 9 |   selector:
10 |     matchLabels:
11 |       app: statesvc
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: statesvc
16 |     spec:
17 |       hostPID: true
18 |       containers:
19 |         - name: statesvc
20 |           image: inferx/inferx_one:v0.1.1
21 |           imagePullPolicy: IfNotPresent
22 |           env:
23 |           - name: POD_IP
24 |             valueFrom:
25 |               fieldRef:
26 |                 fieldPath: status.podIP          
27 |           - name: RUN_SERVICE
28 |             value: "StateSvc"
29 |           - name: CACHE_MEMORY
30 |             value: 20Gi
31 |           volumeMounts:
32 |             - mountPath: /opt/inferx/
33 |               name: opt-inferx
34 |           command: ["./onenode", "/opt/inferx/config/node.json"]
35 |       volumes:
36 |         - name: opt-inferx
37 |           hostPath:
38 |             path: /opt/inferx/       
39 | ---
40 | apiVersion: v1
41 | kind: Service
42 | metadata:
43 |   name: statesvc
44 | spec:
45 |   type: NodePort
46 |   selector:
47 |     app: statesvc
48 |   ports:
49 |     - name: http
50 |       port: 1237
51 |       targetPort: 1237
52 | 


--------------------------------------------------------------------------------
/nodeconfig/node.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node1",
 3 |     "etcdAddrs": [
 4 |         "http://etcd:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.3.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 |     "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 |     "runService": true,
21 |     "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 400000,
25 |         "GPUs": "Auto",
26 |         "ContextOverhead": 450,
27 |         "MaxContextPerGPU": 1
28 |     },
29 |     "snapshotDir": "/opt/inferx/snapshot",
30 |     "enableBlobStore": false,
31 |     "sharemem": {
32 |         "size": 20,
33 |         "hugepage": true
34 |     },
35 |     "tlsconfig": {
36 |         "enable": false,
37 |         "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
38 |         "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
39 |     },
40 |     "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb",
41 |     "keycloakconfig": {
42 |         "url": "http://keycloak:8080/authn",
43 |         "realm": "inferx"
44 |     }
45 | }


--------------------------------------------------------------------------------
/nodeconfig/node1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node1",
 3 |     "etcdAddrs": [
 4 |         "http://localhost:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.3.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 |     "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 |     "runService": true,
21 |     "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 400000,
25 |         "GPUType": "A4000",
26 |         "GPUs": "Auto",
27 |         "ContextOverhead": 450,
28 |         "MaxContextPerGPU": 1
29 |     },
30 |     "snapshotDir": "/opt/inferx/snapshot",
31 |     "enableBlobStore": false,
32 |     "sharemem": {
33 |         "size": 20,
34 |         "hugepage": true
35 |     },
36 |     "tlsconfig": {
37 |         "enable": false,
38 |         "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 |         "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 |     },
41 |     "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb",
42 |     "keycloakconfig": {
43 |         "url": "http://localhost:1260/authn",
44 |         "realm": "inferx"
45 |     }
46 | }


--------------------------------------------------------------------------------
/nodeconfig/node2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node2",
 3 |     "etcdAddrs": [
 4 |         "http://localhost:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.2.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/var/run/quark/tsot-socket",
19 |     "tsotGwSocketPath": "/var/run/quark_host/tsot-socket",
20 |     "runService": false,
21 |     "auditdbAddr": "postgresql://audit_user:123456@localhost/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 300000,
25 |         "GPUType": "A4000",
26 |         "GPUs": "Auto",
27 |         "ContextOverhead": 450,
28 |         "MaxContextPerGPU": 2
29 |     },
30 |     "snapshotDir": "/snapshot",
31 |     "enableBlobStore": true
32 | }


--------------------------------------------------------------------------------
/nodeconfig/node3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node2",
 3 |     "etcdAddrs": [
 4 |         "http://localhost:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.2.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 |     "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 |     "runService": true,
21 |     "auditdbAddr": "postgresql://audit_user:123456@localhost:5432/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 400000,
25 |         "GPUType": "A4000",
26 |         "GPUs": "Auto",
27 |         "ContextOverhead": 440,
28 |         "MaxContextPerGPU": 1
29 |     },
30 |     "snapshotDir": "/opt/inferx/snapshot",
31 |     "enableBlobStore": true,
32 |     "sharemem": {
33 |         "size": 36,
34 |         "hugepage": true
35 |     },
36 |     "tlsconfig": {
37 |         "enable": false,
38 |         "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 |         "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 |     },
41 |     "secretStoreAddr": "postgresql://secret:123456@localhost:5431/secretdb",
42 |     "keycloakconfig": {
43 |         "url": "http://localhost:1260/authn",
44 |         "realm": "inferx"
45 |     }
46 | }


--------------------------------------------------------------------------------
/nodeconfig/node4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node3",
 3 |     "etcdAddrs": [
 4 |         "http://localhost:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.2.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 |     "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 |     "runService": true,
21 |     "auditdbAddr": "postgresql://audit_user:123456@localhost:30542/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 400000,
25 |         "GPUType": "A4000",
26 |         "GPUs": "Auto",
27 |         "ContextOverhead": 450,
28 |         "MaxContextPerGPU": 2
29 |     },
30 |     "snapshotDir": "/opt/inferx/snapshot",
31 |     "enableBlobStore": true,
32 |     "sharemem": {
33 |         "size": 36,
34 |         "hugepage": true
35 |     },
36 |     "tlsconfig": {
37 |         "enable": false,
38 |         "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 |         "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 |     },
41 |     "secretStoreAddr": "postgresql://secret:123456@localhost:30541/secretdb",
42 |     "keycloakconfig": {
43 |         "url": "http://localhost:31260",
44 |         "realm": "inferx",
45 |         "adminUser": "admin",
46 |         "adminPassword": "admin"
47 |     }
48 | }


--------------------------------------------------------------------------------
/nodeconfig/node_blob.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nodeName": "node1",
 3 |     "etcdAddrs": [
 4 |         "http://etcd:2379"
 5 |     ],
 6 |     "hostIpCidr": "192.168.0.0/16",
 7 |     "podMgrPort": 1233,
 8 |     "tsotCniPort": 1234,
 9 |     "tsotSvcPort": 1235,
10 |     "qletStateSvcPort": 1236,
11 |     "statSvcPort": 1237,
12 |     "schedulerPort": 1238,
13 |     "gatewayPort": 4000,
14 |     "cidr": "10.1.3.0/8",
15 |     "stateSvcAddrs": [
16 |         "http://localhost:1237"
17 |     ],
18 |     "tsotSocketPath": "/opt/inferx/sockets/tsot-socket",
19 |     "tsotGwSocketPath": "/opt/inferx/sockets_host/tsot-socket",
20 |     "runService": true,
21 |     "auditdbAddr": "postgresql://audit_user:123456@db:5432/auditdb",
22 |     "resources": {
23 |         "CPU": 30000,
24 |         "Mem": 400000,
25 |         "GPUType": "A4000",
26 |         "GPUs": "Auto",
27 |         "ContextOverhead": 450,
28 |         "MaxContextPerGPU": 1
29 |     },
30 |     "snapshotDir": "/opt/inferx/snapshot",
31 |     "enableBlobStore": true,
32 |     "sharemem": {
33 |         "size": 50,
34 |         "hugepage": true
35 |     },
36 |     "tlsconfig": {
37 |         "enable": false,
38 |         "cert": "/etc/letsencrypt/live/inferx.net/fullchain.pem",
39 |         "key": "/etc/letsencrypt/live/inferx.net/privkey.pem"
40 |     },
41 |     "secretStoreAddr": "postgresql://secret:123456@secret-db:5432/secretdb",
42 |     "keycloakconfig": {
43 |         "url": "http://keycloak:8080/authn",
44 |         "realm": "inferx"
45 |     }
46 | }


--------------------------------------------------------------------------------
/script/inferx_clean.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PARENT_DIR="/opt/inferx/sandbox/"
 4 | INFERX_BIN="/opt/inferx/bin/inferx"
 5 | 
 6 | # pkill -9 inferx
 7 | 
 8 | for SUBDIR in "$PARENT_DIR"/*; do
 9 |   if [ -d "$SUBDIR" ]; then
10 |     SUBFOLDER_NAME=$(basename "$SUBDIR")
11 |     echo "Running inferx on: $SUBFOLDER_NAME"
12 |     "$INFERX_BIN" \
13 |       --root "/var/run/docker/runtime-runc/moby" \
14 |       --log-format json \
15 |       --systemd-cgroup delete "$SUBFOLDER_NAME"
16 |     
17 |   fi
18 | done


--------------------------------------------------------------------------------