├── README.md
├── bootstrap
│   ├── gpu-machineset.sh
│   └── nvidia.conf
├── docs
│   ├── multi-gpu.png
│   ├── quant.md
│   └── tp-diagram.png
├── llm-servers
│   ├── base
│   │   ├── kustomization.yaml
│   │   ├── llm-deployment.yaml
│   │   ├── llm-pvc.yaml
│   │   └── llm-svc.yaml
│   └── overlays
│       ├── falcon-40b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── granite-8b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── llama2-13b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── llama3-7b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── mistral-7b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       └── mixtral-8x7b
│           ├── README.md
│           ├── gpu_patch.yaml
│           ├── kustomization.yaml
│           ├── model_patch.yaml
│           └── shm_patch.yaml
├── serving-runtimes
│   ├── base
│   │   ├── inference-server.yaml
│   │   ├── kustomization.yaml
│   │   └── serving-runtime.yaml
│   └── overlays
│       ├── granite-7B
│       │   └── README.md
│       └── llama3-8B
│           └── README.md
└── test-notebooks
    ├── README.md
    └── vllm_rest_requests.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/README.md
--------------------------------------------------------------------------------
/bootstrap/gpu-machineset.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/bootstrap/gpu-machineset.sh
--------------------------------------------------------------------------------
/bootstrap/nvidia.conf:
--------------------------------------------------------------------------------
NVreg_EnableGpuFirmware=0
--------------------------------------------------------------------------------
/docs/multi-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/multi-gpu.png
--------------------------------------------------------------------------------
/docs/quant.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/quant.md
--------------------------------------------------------------------------------
/docs/tp-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/tp-diagram.png
--------------------------------------------------------------------------------
/llm-servers/base/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-deployment.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-deployment.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-pvc.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-pvc.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-svc.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-svc.yaml
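
The base manifests above (deployment, PVC, service) are only referenced by URL. For orientation, a minimal vLLM Deployment of the kind such a base typically contains might look like the sketch below; the image, object names, model ID, and mount paths are illustrative assumptions, not the repo's actual llm-deployment.yaml.

```yaml
# Illustrative sketch only, not the repo's llm-deployment.yaml.
# The vllm/vllm-openai image, the "llm" names, the model ID and the PVC name
# are all placeholders.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llm
  template:
    metadata:
      labels:
        app: llm
    spec:
      containers:
        - name: server
          image: vllm/vllm-openai:latest
          args:
            - "--model=mistralai/Mistral-7B-Instruct-v0.2"  # per-model overlays presumably swap this (model_patch.yaml)
            - "--download-dir=/models-cache"
          ports:
            - containerPort: 8000
              name: http
          resources:
            limits:
              nvidia.com/gpu: "1"  # larger models presumably raise this (gpu_patch.yaml)
          volumeMounts:
            - name: models-cache
              mountPath: /models-cache
      volumes:
        - name: models-cache
          persistentVolumeClaim:
            claimName: llm-models-cache  # provided by llm-pvc.yaml in the base
```
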
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/shm_patch.yaml
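
Every model overlay carries a gpu_patch.yaml, referenced above only by URL. As an illustration of what a patch like this generally does, the hypothetical strategic-merge patch below raises the GPU count on the base Deployment; the object and container names and the count of two GPUs are assumptions, not the contents of the actual files.

```yaml
# Hypothetical gpu_patch.yaml sketch: request more GPUs on the base Deployment.
# "llm" and "server" must match whatever names llm-deployment.yaml really uses.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  template:
    spec:
      containers:
        - name: server
          resources:
            requests:
              nvidia.com/gpu: "2"
            limits:
              nvidia.com/gpu: "2"  # e.g. two GPUs for a model that needs tensor parallelism
```

When more than one GPU is allocated this way, vLLM's tensor-parallel degree (--tensor-parallel-size) has to match the GPU count, which is typically what distinguishes the larger overlays such as falcon-40b and mixtral-8x7b from the single-GPU ones.
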
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/shm_patch.yaml
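
The shm_patch.yaml files are likewise only linked. The usual reason such a patch exists is that NCCL, which vLLM uses for tensor-parallel communication, needs far more shared memory than the 64 MiB /dev/shm that containers get by default. A sketch of that standard pattern, with hypothetical names and size, not the repo's actual file:

```yaml
# Hypothetical shm_patch.yaml sketch: mount a memory-backed emptyDir at /dev/shm
# so NCCL has enough shared memory for multi-GPU (tensor-parallel) inference.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  template:
    spec:
      containers:
        - name: server
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi  # illustrative size
```
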
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/shm_patch.yaml
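
Each of the six overlay directories above pairs a kustomization.yaml with the three patches. The actual files are linked above; a minimal overlay wiring of that shape (assumed, not copied from the repo) would be:

```yaml
# Hypothetical overlays/<model>/kustomization.yaml sketch:
# reuse the shared base and layer the model-, GPU- and shm-specific patches on top.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base
patches:
  - path: model_patch.yaml
  - path: gpu_patch.yaml
  - path: shm_patch.yaml
```

An overlay of this form is rendered and applied with standard tooling, e.g. `oc apply -k llm-servers/overlays/mixtral-8x7b` or `kustomize build llm-servers/overlays/mixtral-8x7b`.
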
--------------------------------------------------------------------------------
/serving-runtimes/base/inference-server.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/inference-server.yaml
--------------------------------------------------------------------------------
/serving-runtimes/base/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/kustomization.yaml
--------------------------------------------------------------------------------
/serving-runtimes/base/serving-runtime.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/serving-runtime.yaml
--------------------------------------------------------------------------------
/serving-runtimes/overlays/granite-7B/README.md:
--------------------------------------------------------------------------------
## Granite 7B
--------------------------------------------------------------------------------
/serving-runtimes/overlays/llama3-8B/README.md:
--------------------------------------------------------------------------------
## Llama3-8B
--------------------------------------------------------------------------------
/test-notebooks/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/test-notebooks/README.md
--------------------------------------------------------------------------------
/test-notebooks/vllm_rest_requests.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/test-notebooks/vllm_rest_requests.ipynb
--------------------------------------------------------------------------------
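
The serving-runtimes tree (inference-server.yaml, serving-runtime.yaml, and per-model overlays for granite-7B and llama3-8B) is also only linked. If, as the file names suggest, it targets KServe, the two resources typically look roughly like the sketch below; every name, image, port, and storage location here is a placeholder, not the repo's actual configuration.

```yaml
# Hypothetical sketch of a vLLM ServingRuntime plus an InferenceService that uses it.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: vllm-runtime
spec:
  supportedModelFormats:
    - name: vLLM
      autoSelect: true
  containers:
    - name: kserve-container
      image: vllm/vllm-openai:latest  # placeholder image
      args:
        - "--model=/mnt/models"
        - "--port=8080"
      ports:
        - containerPort: 8080
          protocol: TCP
---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: llama3-8b
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      runtime: vllm-runtime
      storageUri: s3://example-bucket/llama3-8b  # placeholder model location
      resources:
        limits:
          nvidia.com/gpu: "1"
```

The vllm_rest_requests.ipynb notebook linked above is presumably where the deployed endpoints are then exercised over REST.
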