├── README.md
├── bootstrap
│   ├── gpu-machineset.sh
│   └── nvidia.conf
├── docs
│   ├── multi-gpu.png
│   ├── quant.md
│   └── tp-diagram.png
├── llm-servers
│   ├── base
│   │   ├── kustomization.yaml
│   │   ├── llm-deployment.yaml
│   │   ├── llm-pvc.yaml
│   │   └── llm-svc.yaml
│   └── overlays
│       ├── falcon-40b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── granite-8b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── llama2-13b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── llama3-7b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       ├── mistral-7b
│       │   ├── README.md
│       │   ├── gpu_patch.yaml
│       │   ├── kustomization.yaml
│       │   ├── model_patch.yaml
│       │   └── shm_patch.yaml
│       └── mixtral-8x7b
│           ├── README.md
│           ├── gpu_patch.yaml
│           ├── kustomization.yaml
│           ├── model_patch.yaml
│           └── shm_patch.yaml
├── serving-runtimes
│   ├── base
│   │   ├── inference-server.yaml
│   │   ├── kustomization.yaml
│   │   └── serving-runtime.yaml
│   └── overlays
│       ├── granite-7B
│       │   └── README.md
│       └── llama3-8B
│           └── README.md
└── test-notebooks
    ├── README.md
    └── vllm_rest_requests.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/README.md
--------------------------------------------------------------------------------
/bootstrap/gpu-machineset.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/bootstrap/gpu-machineset.sh
--------------------------------------------------------------------------------
/bootstrap/nvidia.conf:
--------------------------------------------------------------------------------
NVreg_EnableGpuFirmware=0
--------------------------------------------------------------------------------
/docs/multi-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/multi-gpu.png
--------------------------------------------------------------------------------
/docs/quant.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/quant.md
--------------------------------------------------------------------------------
/docs/tp-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/docs/tp-diagram.png
--------------------------------------------------------------------------------
/llm-servers/base/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-deployment.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-deployment.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-pvc.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-pvc.yaml
--------------------------------------------------------------------------------
/llm-servers/base/llm-svc.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/base/llm-svc.yaml
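
The base manifests above (deployment, PVC, service) are only referenced by URL. For orientation, a minimal vLLM Deployment of the kind such a base typically contains might look like the sketch below; the image, object names, model ID, and mount paths are illustrative assumptions, not the repo's actual llm-deployment.yaml.

```yaml
# Illustrative sketch only, not the repo's llm-deployment.yaml.
# The vllm/vllm-openai image, the "llm" names, the model ID and the PVC name
# are all placeholders.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llm
  template:
    metadata:
      labels:
        app: llm
    spec:
      containers:
        - name: server
          image: vllm/vllm-openai:latest
          args:
            - "--model=mistralai/Mistral-7B-Instruct-v0.2"  # per-model overlays presumably swap this (model_patch.yaml)
            - "--download-dir=/models-cache"
          ports:
            - containerPort: 8000
              name: http
          resources:
            limits:
              nvidia.com/gpu: "1"  # larger models presumably raise this (gpu_patch.yaml)
          volumeMounts:
            - name: models-cache
              mountPath: /models-cache
      volumes:
        - name: models-cache
          persistentVolumeClaim:
            claimName: llm-models-cache  # provided by llm-pvc.yaml in the base
```
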
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/falcon-40b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/falcon-40b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/granite-8b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/granite-8b/shm_patch.yaml
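
Every model overlay carries a gpu_patch.yaml, referenced above only by URL. As an illustration of what a patch like this generally does, the hypothetical strategic-merge patch below raises the GPU count on the base Deployment; the object and container names and the count of two GPUs are assumptions, not the contents of the actual files.

```yaml
# Hypothetical gpu_patch.yaml sketch: request more GPUs on the base Deployment.
# "llm" and "server" must match whatever names llm-deployment.yaml really uses.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  template:
    spec:
      containers:
        - name: server
          resources:
            requests:
              nvidia.com/gpu: "2"
            limits:
              nvidia.com/gpu: "2"  # e.g. two GPUs for a model that needs tensor parallelism
```

When more than one GPU is allocated this way, vLLM's tensor-parallel degree (--tensor-parallel-size) has to match the GPU count, which is typically what distinguishes the larger overlays such as falcon-40b and mixtral-8x7b from the single-GPU ones.
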
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama2-13b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama2-13b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/llama3-7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/llama3-7b/shm_patch.yaml
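
The shm_patch.yaml files are likewise only linked. The usual reason such a patch exists is that NCCL, which vLLM uses for tensor-parallel communication, needs far more shared memory than the 64 MiB /dev/shm that containers get by default. A sketch of that standard pattern, with hypothetical names and size, not the repo's actual file:

```yaml
# Hypothetical shm_patch.yaml sketch: mount a memory-backed emptyDir at /dev/shm
# so NCCL has enough shared memory for multi-GPU (tensor-parallel) inference.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  template:
    spec:
      containers:
        - name: server
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi  # illustrative size
```
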
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mistral-7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mistral-7b/shm_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/README.md
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/gpu_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/gpu_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/kustomization.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/model_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/model_patch.yaml
--------------------------------------------------------------------------------
/llm-servers/overlays/mixtral-8x7b/shm_patch.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/llm-servers/overlays/mixtral-8x7b/shm_patch.yaml
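
Each of the six overlay directories above pairs a kustomization.yaml with the three patches. The actual files are linked above; a minimal overlay wiring of that shape (assumed, not copied from the repo) would be:

```yaml
# Hypothetical overlays/<model>/kustomization.yaml sketch:
# reuse the shared base and layer the model-, GPU- and shm-specific patches on top.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base
patches:
  - path: model_patch.yaml
  - path: gpu_patch.yaml
  - path: shm_patch.yaml
```

An overlay of this form is rendered and applied with standard tooling, e.g. `oc apply -k llm-servers/overlays/mixtral-8x7b` or `kustomize build llm-servers/overlays/mixtral-8x7b`.
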
--------------------------------------------------------------------------------
/serving-runtimes/base/inference-server.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/inference-server.yaml
--------------------------------------------------------------------------------
/serving-runtimes/base/kustomization.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/kustomization.yaml
--------------------------------------------------------------------------------
/serving-runtimes/base/serving-runtime.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/serving-runtimes/base/serving-runtime.yaml
--------------------------------------------------------------------------------
/serving-runtimes/overlays/granite-7B/README.md:
--------------------------------------------------------------------------------
## Granite 7B
--------------------------------------------------------------------------------
/serving-runtimes/overlays/llama3-8B/README.md:
--------------------------------------------------------------------------------
## Llama3-8B
--------------------------------------------------------------------------------
/test-notebooks/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/test-notebooks/README.md
--------------------------------------------------------------------------------
/test-notebooks/vllm_rest_requests.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rh-aiservices-bu/multi-gpu-llms/HEAD/test-notebooks/vllm_rest_requests.ipynb
--------------------------------------------------------------------------------
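
The serving-runtimes tree (inference-server.yaml, serving-runtime.yaml, and per-model overlays for granite-7B and llama3-8B) is also only linked. If, as the file names suggest, it targets KServe, the two resources typically look roughly like the sketch below; every name, image, port, and storage location here is a placeholder, not the repo's actual configuration.

```yaml
# Hypothetical sketch of a vLLM ServingRuntime plus an InferenceService that uses it.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: vllm-runtime
spec:
  supportedModelFormats:
    - name: vLLM
      autoSelect: true
  containers:
    - name: kserve-container
      image: vllm/vllm-openai:latest  # placeholder image
      args:
        - "--model=/mnt/models"
        - "--port=8080"
      ports:
        - containerPort: 8080
          protocol: TCP
---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: llama3-8b
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      runtime: vllm-runtime
      storageUri: s3://example-bucket/llama3-8b  # placeholder model location
      resources:
        limits:
          nvidia.com/gpu: "1"
```

The vllm_rest_requests.ipynb notebook linked above is presumably where the deployed endpoints are then exercised over REST.
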