├── .github
└── workflows
│ ├── depot-amd.yml
│ ├── depot-cpu.yml
│ ├── depot-nvidia-8.0.yml
│ ├── depot-nvidia-8.6.yml
│ ├── depot-nvidia.yml
│ └── unit-tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── Faq.md
├── LICENSE
├── README.md
├── cmd
├── bashly-settings.yml
├── bashly.sh
├── bashly.yml
├── benchmark_command.sh
├── build_image_command.sh
├── depot_build_command.sh
├── lib
│ └── colors.sh
├── llm_logs_command.sh
├── llm_ls_command.sh
├── llm_plot_command.sh
├── llm_squeue_command.sh
├── pypi_command.sh
├── test_command.sh
└── up_command.sh
├── deployment
├── ansible
│ ├── hosts
│ └── k8.yaml
└── helm
│ ├── amd_multi_node
│ └── scalarlm
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── api_configmap.yaml
│ │ ├── api_deployment.yaml
│ │ ├── api_service.yaml
│ │ ├── vllm_configmap.yaml
│ │ ├── vllm_deployment.yaml
│ │ └── vllm_service.yaml
│ │ └── values.yaml
│ ├── amd_single_node
│ └── scalarlm
│ │ ├── Chart.yaml
│ │ ├── local-hostpath-sc.yaml
│ │ ├── storageclass-clusterrole.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── api_configmap.yaml
│ │ ├── api_deployment.yaml
│ │ ├── api_service.yaml
│ │ ├── cache_pvc.yaml
│ │ ├── jobs_pvc.yaml
│ │ ├── vllm_configmap.yaml
│ │ ├── vllm_deployment.yaml
│ │ └── vllm_service.yaml
│ │ └── values.yaml
│ ├── amd_single_pod
│ └── scalarlm
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── configmap.yaml
│ │ ├── deployment.yaml
│ │ └── service.yaml
│ │ └── values.yaml
│ ├── cray
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates
│ │ ├── NOTES.txt
│ │ ├── _helpers.tpl
│ │ ├── deployment.yaml
│ │ ├── hpa.yaml
│ │ ├── ingress.yaml
│ │ ├── service.yaml
│ │ ├── serviceaccount.yaml
│ │ └── tests
│ │ │ └── test-connection.yaml
│ └── values.yaml
│ ├── lambda
│ └── cray
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── configmap.yaml
│ │ ├── deployment.yaml
│ │ └── service.yaml
│ │ └── values.yaml
│ ├── minikube
│ └── cray
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── deployment.yaml
│ │ └── service.yaml
│ │ └── values.yaml
│ ├── tensorwave3b
│ └── scalarlm
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── api_configmap.yaml
│ │ ├── api_deployment.yaml
│ │ ├── api_service.yaml
│ │ ├── cache_pvc.yaml
│ │ ├── jobs_pvc.yaml
│ │ ├── vllm_configmap.yaml
│ │ ├── vllm_deployment.yaml
│ │ └── vllm_service.yaml
│ │ └── values.yaml
│ ├── tensorwave70b
│ └── scalarlm
│ │ ├── Chart.yaml
│ │ ├── templates
│ │ ├── _helpers.tpl
│ │ ├── api_configmap.yaml
│ │ ├── api_deployment.yaml
│ │ ├── api_service.yaml
│ │ ├── cache_pvc.yaml
│ │ ├── jobs_pvc.yaml
│ │ ├── vllm_configmap.yaml
│ │ ├── vllm_deployment.yaml
│ │ └── vllm_service.yaml
│ │ └── values.yaml
│ └── tensorwave8b
│ └── scalarlm
│ ├── Chart.yaml
│ ├── templates
│ ├── _helpers.tpl
│ ├── api_configmap.yaml
│ ├── api_deployment.yaml
│ ├── api_service.yaml
│ ├── cache_pvc.yaml
│ ├── jobs_pvc.yaml
│ ├── vllm_configmap.yaml
│ ├── vllm_deployment.yaml
│ └── vllm_service.yaml
│ └── values.yaml
├── docker-compose.yaml
├── docs
├── cray-docs
│ ├── docs
│ │ ├── arch.md
│ │ ├── assets
│ │ │ ├── cray-arch.png
│ │ │ ├── cray.jpeg
│ │ │ └── loss_plot_044db4ac60.png
│ │ ├── cli
│ │ │ ├── cli.md
│ │ │ ├── list-models.md
│ │ │ ├── plot.md
│ │ │ ├── squeue.md
│ │ │ └── training-logs.md
│ │ ├── contact.md
│ │ ├── deployment
│ │ │ ├── docker.md
│ │ │ ├── kubernetes.md
│ │ │ ├── laptop.md
│ │ │ ├── modal-details.md
│ │ │ └── modal.md
│ │ ├── index.md
│ │ ├── inference.md
│ │ ├── quickstart.md
│ │ └── training.md
│ └── mkdocs.yml
└── deploy.sh
├── frontend
└── assets
│ └── logo.svg
├── infra
├── CMakeLists.txt
├── cmake
│ ├── cpu_extension.cmake
│ ├── hipify.py
│ └── utils.cmake
├── cray_infra
│ ├── api
│ │ ├── fastapi
│ │ │ ├── aiohttp
│ │ │ │ └── get_global_session.py
│ │ │ ├── generate
│ │ │ │ ├── embed.py
│ │ │ │ ├── finish_work.py
│ │ │ │ ├── generate.py
│ │ │ │ ├── get_results.py
│ │ │ │ ├── get_work.py
│ │ │ │ └── poll_for_responses.py
│ │ │ ├── health
│ │ │ │ └── check_health.py
│ │ │ ├── main.py
│ │ │ ├── routers
│ │ │ │ ├── generate_router.py
│ │ │ │ ├── health_router.py
│ │ │ │ ├── megatron_router.py
│ │ │ │ ├── openai_router.py
│ │ │ │ └── request_types
│ │ │ │ │ ├── embed_request.py
│ │ │ │ │ ├── finish_work_request.py
│ │ │ │ │ ├── generate_request.py
│ │ │ │ │ ├── generate_response.py
│ │ │ │ │ ├── get_results_request.py
│ │ │ │ │ ├── get_results_response.py
│ │ │ │ │ ├── get_work_request.py
│ │ │ │ │ ├── get_work_response.py
│ │ │ │ │ ├── list_models_response.py
│ │ │ │ │ ├── squeue_response.py
│ │ │ │ │ └── train_request.py
│ │ │ └── tasks
│ │ │ │ └── add_megatron_tasks.py
│ │ └── work_queue
│ │ │ └── inference_work_queue.py
│ ├── generate
│ │ └── clear_acked_requests_from_queue.py
│ ├── one_server
│ │ ├── create_api.py
│ │ ├── create_vllm.py
│ │ ├── main.py
│ │ ├── start_cray_server.py
│ │ └── wait_for_vllm.py
│ ├── slurm
│ │ └── discovery
│ │ │ └── discover_clusters.py
│ ├── training
│ │ ├── distribution_strategy
│ │ │ └── fsdp
│ │ │ │ └── fsdp.py
│ │ ├── get_latest_model.py
│ │ ├── get_training_job_info.py
│ │ ├── gpu_aware_mpi
│ │ │ ├── gpu_aware_mpi.cpp
│ │ │ └── setup.py
│ │ ├── launch_training_job.py
│ │ ├── list_models.py
│ │ ├── metrics.py
│ │ ├── print_logo.py
│ │ ├── register_megatron_models.py
│ │ ├── restart_megatron_jobs.py
│ │ ├── squeue.py
│ │ ├── training_harness.py
│ │ ├── training_job_status.py
│ │ ├── training_logs_generator.py
│ │ ├── upload_training_data.py
│ │ └── vllm_model_manager.py
│ ├── util
│ │ ├── default_config.py
│ │ ├── default_job_config.py
│ │ ├── get_config.py
│ │ └── get_job_config.py
│ └── vllm
│ │ ├── __init__.py
│ │ ├── _core_ext.py
│ │ ├── _custom_ops.py
│ │ ├── _ipex_ops.py
│ │ ├── _version.py
│ │ ├── adapter_commons
│ │ ├── __init__.py
│ │ ├── layers.py
│ │ ├── models.py
│ │ ├── request.py
│ │ ├── utils.py
│ │ └── worker_manager.py
│ │ ├── assets
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── base.py
│ │ ├── image.py
│ │ └── video.py
│ │ ├── attention
│ │ ├── __init__.py
│ │ ├── backends
│ │ │ ├── __init__.py
│ │ │ ├── abstract.py
│ │ │ ├── blocksparse_attn.py
│ │ │ ├── flash_attn.py
│ │ │ ├── flashinfer.py
│ │ │ ├── ipex_attn.py
│ │ │ ├── openvino.py
│ │ │ ├── pallas.py
│ │ │ ├── rocm_flash_attn.py
│ │ │ ├── torch_sdpa.py
│ │ │ ├── utils.py
│ │ │ └── xformers.py
│ │ ├── layer.py
│ │ ├── ops
│ │ │ ├── __init__.py
│ │ │ ├── blocksparse_attention
│ │ │ │ ├── __init__.py
│ │ │ │ ├── blocksparse_attention_kernel.py
│ │ │ │ ├── interface.py
│ │ │ │ └── utils.py
│ │ │ ├── ipex_attn.py
│ │ │ ├── paged_attn.py
│ │ │ ├── prefix_prefill.py
│ │ │ └── triton_flash_attention.py
│ │ └── selector.py
│ │ ├── beam_search.py
│ │ ├── block.py
│ │ ├── compilation
│ │ ├── __init__.py
│ │ ├── backends.py
│ │ └── wrapper.py
│ │ ├── config.py
│ │ ├── connections.py
│ │ ├── core
│ │ ├── __init__.py
│ │ ├── block
│ │ │ ├── __init__.py
│ │ │ ├── block_table.py
│ │ │ ├── common.py
│ │ │ ├── cpu_gpu_block_allocator.py
│ │ │ ├── interfaces.py
│ │ │ ├── naive_block.py
│ │ │ ├── prefix_caching_block.py
│ │ │ └── utils.py
│ │ ├── block_manager_v1.py
│ │ ├── block_manager_v2.py
│ │ ├── embedding_model_block_manager.py
│ │ ├── evictor_v1.py
│ │ ├── evictor_v2.py
│ │ ├── interfaces.py
│ │ └── scheduler.py
│ │ ├── distributed
│ │ ├── __init__.py
│ │ ├── communication_op.py
│ │ ├── device_communicators
│ │ │ ├── __init__.py
│ │ │ ├── cuda_wrapper.py
│ │ │ ├── custom_all_reduce.py
│ │ │ ├── custom_all_reduce_utils.py
│ │ │ ├── pynccl.py
│ │ │ ├── pynccl_wrapper.py
│ │ │ ├── shm_broadcast.py
│ │ │ └── tpu_communicator.py
│ │ ├── parallel_state.py
│ │ └── utils.py
│ │ ├── engine
│ │ ├── __init__.py
│ │ ├── arg_utils.py
│ │ ├── async_llm_engine.py
│ │ ├── async_timeout.py
│ │ ├── llm_engine.py
│ │ ├── metrics.py
│ │ ├── metrics_types.py
│ │ ├── multiprocessing
│ │ │ ├── __init__.py
│ │ │ ├── client.py
│ │ │ └── engine.py
│ │ ├── output_processor
│ │ │ ├── __init__.py
│ │ │ ├── interfaces.py
│ │ │ ├── multi_step.py
│ │ │ ├── single_step.py
│ │ │ ├── stop_checker.py
│ │ │ └── util.py
│ │ └── protocol.py
│ │ ├── entrypoints
│ │ ├── __init__.py
│ │ ├── api_server.py
│ │ ├── chat_utils.py
│ │ ├── launcher.py
│ │ ├── llm.py
│ │ ├── logger.py
│ │ └── openai
│ │ │ ├── __init__.py
│ │ │ ├── api_server.py
│ │ │ ├── cli_args.py
│ │ │ ├── logits_processors.py
│ │ │ ├── protocol.py
│ │ │ ├── run_batch.py
│ │ │ ├── serving_chat.py
│ │ │ ├── serving_completion.py
│ │ │ ├── serving_embedding.py
│ │ │ ├── serving_engine.py
│ │ │ ├── serving_tokenization.py
│ │ │ └── tool_parsers
│ │ │ ├── __init__.py
│ │ │ ├── abstract_tool_parser.py
│ │ │ ├── hermes_tool_parser.py
│ │ │ ├── internlm2_tool_parser.py
│ │ │ ├── llama_tool_parser.py
│ │ │ ├── mistral_tool_parser.py
│ │ │ └── utils.py
│ │ ├── envs.py
│ │ ├── executor
│ │ ├── __init__.py
│ │ ├── cpu_executor.py
│ │ ├── distributed_gpu_executor.py
│ │ ├── executor_base.py
│ │ ├── gpu_executor.py
│ │ ├── msgspec_utils.py
│ │ ├── multiproc_gpu_executor.py
│ │ ├── multiproc_worker_utils.py
│ │ ├── multiproc_xpu_executor.py
│ │ ├── neuron_executor.py
│ │ ├── openvino_executor.py
│ │ ├── ray_gpu_executor.py
│ │ ├── ray_tpu_executor.py
│ │ ├── ray_utils.py
│ │ ├── ray_xpu_executor.py
│ │ ├── tpu_executor.py
│ │ └── xpu_executor.py
│ │ ├── forward_context.py
│ │ ├── inputs
│ │ ├── __init__.py
│ │ ├── data.py
│ │ ├── parse.py
│ │ ├── preprocess.py
│ │ └── registry.py
│ │ ├── logger.py
│ │ ├── logging
│ │ ├── __init__.py
│ │ └── formatter.py
│ │ ├── lora
│ │ ├── __init__.py
│ │ ├── fully_sharded_layers.py
│ │ ├── layers.py
│ │ ├── lora.py
│ │ ├── models.py
│ │ ├── ops
│ │ │ ├── __init__.py
│ │ │ ├── bgmv_expand.py
│ │ │ ├── bgmv_expand_slice.py
│ │ │ ├── bgmv_shrink.py
│ │ │ ├── sgmv_expand.py
│ │ │ ├── sgmv_expand_slice.py
│ │ │ ├── sgmv_shrink.py
│ │ │ └── utils.py
│ │ ├── punica.py
│ │ ├── request.py
│ │ ├── utils.py
│ │ └── worker_manager.py
│ │ ├── model_executor
│ │ ├── __init__.py
│ │ ├── custom_op.py
│ │ ├── guided_decoding
│ │ │ ├── __init__.py
│ │ │ ├── guided_fields.py
│ │ │ ├── lm_format_enforcer_decoding.py
│ │ │ ├── outlines_decoding.py
│ │ │ └── outlines_logits_processors.py
│ │ ├── layers
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── fused_moe
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs
│ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ └── README
│ │ │ │ ├── fused_marlin_moe.py
│ │ │ │ ├── fused_moe.py
│ │ │ │ ├── layer.py
│ │ │ │ └── moe_pallas.py
│ │ │ ├── layernorm.py
│ │ │ ├── linear.py
│ │ │ ├── logits_processor.py
│ │ │ ├── mamba
│ │ │ │ ├── __init__.py
│ │ │ │ └── ops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── causal_conv1d.py
│ │ │ │ │ └── mamba_ssm.py
│ │ │ ├── pooler.py
│ │ │ ├── quantization
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aqlm.py
│ │ │ │ ├── awq.py
│ │ │ │ ├── awq_marlin.py
│ │ │ │ ├── awq_triton.py
│ │ │ │ ├── base_config.py
│ │ │ │ ├── bitsandbytes.py
│ │ │ │ ├── compressed_tensors
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── compressed_tensors.py
│ │ │ │ │ ├── compressed_tensors_moe.py
│ │ │ │ │ ├── schemes
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── compressed_tensors_scheme.py
│ │ │ │ │ │ ├── compressed_tensors_w4a16_24.py
│ │ │ │ │ │ ├── compressed_tensors_w8a16_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_int8.py
│ │ │ │ │ │ └── compressed_tensors_wNa16.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── deepspeedfp.py
│ │ │ │ ├── experts_int8.py
│ │ │ │ ├── fbgemm_fp8.py
│ │ │ │ ├── fp8.py
│ │ │ │ ├── gguf.py
│ │ │ │ ├── gptq.py
│ │ │ │ ├── gptq_marlin.py
│ │ │ │ ├── gptq_marlin_24.py
│ │ │ │ ├── ipex_quant.py
│ │ │ │ ├── kernels
│ │ │ │ │ ├── MPLinearKernel.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── machete.py
│ │ │ │ │ └── marlin.py
│ │ │ │ ├── kv_cache.py
│ │ │ │ ├── marlin.py
│ │ │ │ ├── modelopt.py
│ │ │ │ ├── neuron_quant.py
│ │ │ │ ├── qqq.py
│ │ │ │ ├── schema.py
│ │ │ │ ├── tpu_int8.py
│ │ │ │ └── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── layer_utils.py
│ │ │ │ │ ├── machete_utils.py
│ │ │ │ │ ├── marlin_utils.py
│ │ │ │ │ ├── marlin_utils_fp8.py
│ │ │ │ │ ├── marlin_utils_test.py
│ │ │ │ │ ├── marlin_utils_test_24.py
│ │ │ │ │ ├── marlin_utils_test_qqq.py
│ │ │ │ │ ├── quant_utils.py
│ │ │ │ │ └── w8a8_utils.py
│ │ │ ├── rejection_sampler.py
│ │ │ ├── resampler.py
│ │ │ ├── rotary_embedding.py
│ │ │ ├── sampler.py
│ │ │ ├── spec_decode_base_sampler.py
│ │ │ ├── typical_acceptance_sampler.py
│ │ │ └── vocab_parallel_embedding.py
│ │ ├── model_loader
│ │ │ ├── __init__.py
│ │ │ ├── loader.py
│ │ │ ├── neuron.py
│ │ │ ├── openvino.py
│ │ │ ├── tensorizer.py
│ │ │ ├── utils.py
│ │ │ └── weight_utils.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── arctic.py
│ │ │ ├── baichuan.py
│ │ │ ├── bart.py
│ │ │ ├── blip.py
│ │ │ ├── blip2.py
│ │ │ ├── bloom.py
│ │ │ ├── chameleon.py
│ │ │ ├── chatglm.py
│ │ │ ├── clip.py
│ │ │ ├── commandr.py
│ │ │ ├── dbrx.py
│ │ │ ├── decilm.py
│ │ │ ├── deepseek.py
│ │ │ ├── deepseek_v2.py
│ │ │ ├── eagle.py
│ │ │ ├── exaone.py
│ │ │ ├── falcon.py
│ │ │ ├── fuyu.py
│ │ │ ├── gemma.py
│ │ │ ├── gemma2.py
│ │ │ ├── gemma2_embedding.py
│ │ │ ├── gpt2.py
│ │ │ ├── gpt_bigcode.py
│ │ │ ├── gpt_j.py
│ │ │ ├── gpt_neox.py
│ │ │ ├── granite.py
│ │ │ ├── granitemoe.py
│ │ │ ├── idefics2_vision_model.py
│ │ │ ├── interfaces.py
│ │ │ ├── interfaces_base.py
│ │ │ ├── intern_vit.py
│ │ │ ├── internlm2.py
│ │ │ ├── internvl.py
│ │ │ ├── jais.py
│ │ │ ├── jamba.py
│ │ │ ├── llama.py
│ │ │ ├── llama_embedding.py
│ │ │ ├── llava.py
│ │ │ ├── llava_next.py
│ │ │ ├── llava_next_video.py
│ │ │ ├── llava_onevision.py
│ │ │ ├── medusa.py
│ │ │ ├── minicpm.py
│ │ │ ├── minicpm3.py
│ │ │ ├── minicpmv.py
│ │ │ ├── mixtral.py
│ │ │ ├── mixtral_quant.py
│ │ │ ├── mllama.py
│ │ │ ├── mlp_speculator.py
│ │ │ ├── module_mapping.py
│ │ │ ├── mpt.py
│ │ │ ├── nemotron.py
│ │ │ ├── nvlm_d.py
│ │ │ ├── olmo.py
│ │ │ ├── olmoe.py
│ │ │ ├── opt.py
│ │ │ ├── orion.py
│ │ │ ├── paligemma.py
│ │ │ ├── persimmon.py
│ │ │ ├── phi.py
│ │ │ ├── phi3.py
│ │ │ ├── phi3_small.py
│ │ │ ├── phi3v.py
│ │ │ ├── phimoe.py
│ │ │ ├── pixtral.py
│ │ │ ├── qwen.py
│ │ │ ├── qwen2.py
│ │ │ ├── qwen2_moe.py
│ │ │ ├── qwen2_rm.py
│ │ │ ├── qwen2_vl.py
│ │ │ ├── registry.py
│ │ │ ├── siglip.py
│ │ │ ├── solar.py
│ │ │ ├── stablelm.py
│ │ │ ├── starcoder2.py
│ │ │ ├── ultravox.py
│ │ │ ├── utils.py
│ │ │ └── xverse.py
│ │ ├── parameter.py
│ │ ├── pooling_metadata.py
│ │ ├── sampling_metadata.py
│ │ └── utils.py
│ │ ├── multimodal
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── base.py
│ │ ├── image.py
│ │ ├── registry.py
│ │ ├── utils.py
│ │ └── video.py
│ │ ├── outputs.py
│ │ ├── platforms
│ │ ├── __init__.py
│ │ ├── cpu.py
│ │ ├── cuda.py
│ │ ├── interface.py
│ │ ├── rocm.py
│ │ ├── tpu.py
│ │ └── xpu.py
│ │ ├── plugins
│ │ └── __init__.py
│ │ ├── pooling_params.py
│ │ ├── prompt_adapter
│ │ ├── __init__.py
│ │ ├── layers.py
│ │ ├── models.py
│ │ ├── request.py
│ │ ├── utils.py
│ │ └── worker_manager.py
│ │ ├── py.typed
│ │ ├── sampling_params.py
│ │ ├── scalar_type.py
│ │ ├── scripts.py
│ │ ├── sequence.py
│ │ ├── spec_decode
│ │ ├── __init__.py
│ │ ├── batch_expansion.py
│ │ ├── draft_model_runner.py
│ │ ├── interfaces.py
│ │ ├── medusa_worker.py
│ │ ├── metrics.py
│ │ ├── mlp_speculator_worker.py
│ │ ├── mqa_scorer.py
│ │ ├── multi_step_worker.py
│ │ ├── ngram_worker.py
│ │ ├── proposer_worker_base.py
│ │ ├── smaller_tp_proposer_worker.py
│ │ ├── spec_decode_worker.py
│ │ ├── target_model_runner.py
│ │ ├── top1_proposer.py
│ │ └── util.py
│ │ ├── tokenformer
│ │ ├── __init__.py
│ │ └── tokenformer_model_manager.py
│ │ ├── tracing.py
│ │ ├── transformers_utils
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── configs
│ │ │ ├── __init__.py
│ │ │ ├── arctic.py
│ │ │ ├── chatglm.py
│ │ │ ├── dbrx.py
│ │ │ ├── eagle.py
│ │ │ ├── exaone.py
│ │ │ ├── falcon.py
│ │ │ ├── internvl.py
│ │ │ ├── jais.py
│ │ │ ├── medusa.py
│ │ │ ├── mllama.py
│ │ │ ├── mlp_speculator.py
│ │ │ ├── mpt.py
│ │ │ ├── nemotron.py
│ │ │ ├── nvlm_d.py
│ │ │ ├── qwen2vl.py
│ │ │ ├── solar.py
│ │ │ └── ultravox.py
│ │ ├── detokenizer.py
│ │ ├── processor.py
│ │ ├── tokenizer.py
│ │ ├── tokenizer_group
│ │ │ ├── __init__.py
│ │ │ ├── base_tokenizer_group.py
│ │ │ ├── ray_tokenizer_group.py
│ │ │ └── tokenizer_group.py
│ │ ├── tokenizers
│ │ │ ├── __init__.py
│ │ │ └── mistral.py
│ │ └── utils.py
│ │ ├── triton_utils
│ │ ├── __init__.py
│ │ ├── custom_cache_manager.py
│ │ ├── importing.py
│ │ └── libentry.py
│ │ ├── usage
│ │ ├── __init__.py
│ │ └── usage_lib.py
│ │ ├── utils.py
│ │ ├── version.py
│ │ ├── vllm_flash_attn
│ │ └── .gitkeep
│ │ └── worker
│ │ ├── __init__.py
│ │ ├── cache_engine.py
│ │ ├── cpu_enc_dec_model_runner.py
│ │ ├── cpu_model_runner.py
│ │ ├── cpu_worker.py
│ │ ├── embedding_model_runner.py
│ │ ├── enc_dec_model_runner.py
│ │ ├── model_runner.py
│ │ ├── model_runner_base.py
│ │ ├── multi_step_model_runner.py
│ │ ├── multi_step_tpu_worker.py
│ │ ├── multi_step_worker.py
│ │ ├── neuron_model_runner.py
│ │ ├── neuron_worker.py
│ │ ├── openvino_model_runner.py
│ │ ├── openvino_worker.py
│ │ ├── tpu_model_runner.py
│ │ ├── tpu_worker.py
│ │ ├── utils.py
│ │ ├── worker.py
│ │ ├── worker_base.py
│ │ ├── xpu_model_runner.py
│ │ └── xpu_worker.py
├── csrc
│ ├── activation_kernels.cu
│ ├── attention
│ │ ├── attention_dtypes.h
│ │ ├── attention_generic.cuh
│ │ ├── attention_kernels.cu
│ │ ├── attention_utils.cuh
│ │ ├── dtype_bfloat16.cuh
│ │ ├── dtype_float16.cuh
│ │ ├── dtype_float32.cuh
│ │ └── dtype_fp8.cuh
│ ├── cache.h
│ ├── cache_kernels.cu
│ ├── core
│ │ ├── exception.hpp
│ │ ├── registration.h
│ │ ├── scalar_type.hpp
│ │ └── torch_bindings.cpp
│ ├── cpu
│ │ ├── activation.cpp
│ │ ├── attention.cpp
│ │ ├── cache.cpp
│ │ ├── cpu_types.hpp
│ │ ├── cpu_types_arm.hpp
│ │ ├── cpu_types_vsx.hpp
│ │ ├── cpu_types_x86.hpp
│ │ ├── dnnl_helper.hpp
│ │ ├── layernorm.cpp
│ │ ├── pos_encoding.cpp
│ │ ├── quant.cpp
│ │ ├── torch_bindings.cpp
│ │ └── utils.cpp
│ ├── cuda_compat.h
│ ├── cuda_utils.h
│ ├── cuda_utils_kernels.cu
│ ├── custom_all_reduce.cu
│ ├── custom_all_reduce.cuh
│ ├── custom_all_reduce_test.cu
│ ├── cutlass_extensions
│ │ ├── cute_utils.cuh
│ │ ├── torch_utils.hpp
│ │ ├── vllm_collective_builder.cuh
│ │ ├── vllm_custom_types.cuh
│ │ ├── vllm_cutlass_library_extension.py
│ │ └── vllm_numeric_conversion.cuh
│ ├── dispatch_utils.h
│ ├── layernorm_kernels.cu
│ ├── mamba
│ │ ├── causal_conv1d
│ │ │ ├── causal_conv1d.cu
│ │ │ ├── causal_conv1d.h
│ │ │ └── static_switch.h
│ │ └── mamba_ssm
│ │ │ ├── selective_scan.h
│ │ │ ├── selective_scan_fwd.cu
│ │ │ └── static_switch.h
│ ├── moe
│ │ ├── marlin_kernels
│ │ │ ├── marlin_moe_kernel.h
│ │ │ ├── marlin_moe_kernel_ku4.cu
│ │ │ ├── marlin_moe_kernel_ku4.h
│ │ │ ├── marlin_moe_kernel_ku4b8.cu
│ │ │ ├── marlin_moe_kernel_ku4b8.h
│ │ │ ├── marlin_moe_kernel_ku8b128.cu
│ │ │ └── marlin_moe_kernel_ku8b128.h
│ │ ├── marlin_moe_ops.cu
│ │ ├── moe_ops.h
│ │ ├── topk_softmax_kernels.cu
│ │ └── torch_bindings.cpp
│ ├── moe_align_block_size_kernels.cu
│ ├── ops.h
│ ├── permute_cols.cu
│ ├── pos_encoding_kernels.cu
│ ├── prepare_inputs
│ │ ├── advance_step.cu
│ │ └── advance_step.cuh
│ ├── quantization
│ │ ├── aqlm
│ │ │ └── gemm_kernels.cu
│ │ ├── awq
│ │ │ ├── dequantize.cuh
│ │ │ └── gemm_kernels.cu
│ │ ├── compressed_tensors
│ │ │ └── int8_quant_kernels.cu
│ │ ├── cutlass_w8a8
│ │ │ ├── Epilogues.md
│ │ │ ├── broadcast_load_epilogue_c2x.hpp
│ │ │ ├── broadcast_load_epilogue_c3x.hpp
│ │ │ ├── common.hpp
│ │ │ ├── scaled_mm_c2x.cu
│ │ │ ├── scaled_mm_c2x.cuh
│ │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh
│ │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh
│ │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh
│ │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh
│ │ │ ├── scaled_mm_c3x.cu
│ │ │ └── scaled_mm_entry.cu
│ │ ├── fp8
│ │ │ ├── amd
│ │ │ │ ├── hip_float8.h
│ │ │ │ ├── hip_float8_impl.h
│ │ │ │ └── quant_utils.cuh
│ │ │ ├── common.cu
│ │ │ ├── fp8_marlin.cu
│ │ │ └── nvidia
│ │ │ │ └── quant_utils.cuh
│ │ ├── gguf
│ │ │ ├── dequantize.cuh
│ │ │ ├── ggml-common.h
│ │ │ ├── gguf_kernel.cu
│ │ │ ├── mmq.cuh
│ │ │ ├── mmvq.cuh
│ │ │ └── vecdotq.cuh
│ │ ├── gptq
│ │ │ ├── compat.cuh
│ │ │ ├── matrix_view.cuh
│ │ │ ├── q_gemm.cu
│ │ │ ├── qdq_2.cuh
│ │ │ ├── qdq_3.cuh
│ │ │ ├── qdq_4.cuh
│ │ │ ├── qdq_8.cuh
│ │ │ └── qdq_util.cuh
│ │ ├── gptq_marlin
│ │ │ ├── awq_marlin_repack.cu
│ │ │ ├── gptq_marlin.cu
│ │ │ ├── gptq_marlin_repack.cu
│ │ │ ├── marlin.cuh
│ │ │ └── marlin_dtypes.cuh
│ │ ├── machete
│ │ │ ├── Readme.md
│ │ │ ├── generate.py
│ │ │ ├── machete_collective_builder.cuh
│ │ │ ├── machete_interleaving_utils.cuh
│ │ │ ├── machete_mainloop.cuh
│ │ │ ├── machete_mm_kernel.cuh
│ │ │ ├── machete_mm_launcher.cuh
│ │ │ ├── machete_prepack_kernel.cuh
│ │ │ ├── machete_prepack_launcher.cuh
│ │ │ ├── machete_prepacked_layout.cuh
│ │ │ └── machete_pytorch.cu
│ │ └── marlin
│ │ │ ├── dense
│ │ │ ├── LICENSE
│ │ │ ├── common
│ │ │ │ ├── base.h
│ │ │ │ └── mem.h
│ │ │ └── marlin_cuda_kernel.cu
│ │ │ ├── qqq
│ │ │ └── marlin_qqq_gemm_kernel.cu
│ │ │ └── sparse
│ │ │ ├── LICENSE
│ │ │ ├── common
│ │ │ ├── base.h
│ │ │ ├── mem.h
│ │ │ └── mma.h
│ │ │ └── marlin_24_cuda_kernel.cu
│ ├── rocm
│ │ ├── attention.cu
│ │ ├── ops.h
│ │ └── torch_bindings.cpp
│ └── torch_bindings.cpp
├── requirements-vllm-build.txt
├── requirements-vllm.txt
├── setup.py
├── slurm_configs
│ ├── cgroup.conf
│ ├── gres.conf
│ ├── munge.key
│ ├── slurm.conf
│ └── slurm.key
├── slurm_src
│ ├── cgroup_docker.c
│ └── compile.sh
└── util
│ └── plot_training.py
├── ml
├── cray_megatron
│ ├── collectives
│ │ ├── data_parallelism.py
│ │ └── main_rank_only.py
│ ├── huggingface
│ │ └── download_model.py
│ ├── main.py
│ ├── megatron
│ │ ├── dataset
│ │ │ ├── data_loader.py
│ │ │ └── load_dataset.py
│ │ ├── distribution
│ │ │ └── apply_distribution_strategy.py
│ │ ├── megatron_trainer.py
│ │ └── training_loop.py
│ └── models
│ │ ├── does_any_checkpoint_exist.py
│ │ ├── get_latest_checkpoint_path.py
│ │ ├── get_model_manager.py
│ │ ├── model_manager_base.py
│ │ └── tokenformer
│ │ ├── load_tokenformer_model.py
│ │ └── tokenformer_model_manager.py
└── tokenformer
│ ├── llama_tokenformer_layers.py
│ ├── llama_tokenformer_model.py
│ ├── tokenformer_surgeon.py
│ └── transformers_tokenformer.py
├── requirements.txt
├── scalarlm
├── scripts
├── cray
├── start_one_server.sh
├── start_slurm.sh
└── train_job_entrypoint.sh
├── sdk
├── masint
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── async_supermassive_intelligence.py
│ │ └── supermassive_intelligence.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── ls.py
│ │ ├── main.py
│ │ ├── plot.py
│ │ ├── squeue.py
│ │ └── view_logs.py
│ ├── engines
│ │ ├── __init__.py
│ │ ├── async_cray.py
│ │ └── cray
│ │ │ ├── __init__.py
│ │ │ └── submit_training_job.py
│ └── util
│ │ ├── __init__.py
│ │ ├── get_api_base.py
│ │ └── make_api_url.py
├── pyproject.toml
└── scalarlm
│ └── __init__.py
└── test
├── benchmark
├── main.py
├── pytorch
│ ├── backward.py
│ ├── forward.py
│ ├── gemm.py
│ ├── memcpy.py
│ ├── memcpy_peer.py
│ └── mpi_p2p.py
└── roofline
│ ├── plot_bandwidth_sweep.py
│ └── plot_roofline.py
├── deployment
├── embed.py
├── generate.py
├── health.py
├── train.py
└── train_generate.py
├── infra
├── distribution_strategy
│ ├── benchmark_mpi_collectives.py
│ ├── benchmark_mpi_sendrecv.py
│ └── test_fsdp.py
├── generate.py
├── get_results.py
├── health.py
├── openai_client.py
├── sanity.py
├── slurm.py
├── upload_dataset.py
├── vllm
│ └── tokenformer
│ │ └── test_tokenformer.py
└── vllm_health.py
├── ml
├── rl
│ ├── cs_semester.sqlite
│ ├── mini-bird.json
│ └── sql-reasoning.py
├── sql
│ ├── data.json
│ ├── train.py
│ └── train_generate.py
└── tokenformer
│ ├── test_llama_tokenformer_model.py
│ ├── test_tokenformer.py
│ └── test_tokenformer_surgeon.py
└── requirements-pytest.txt
/.github/workflows/depot-amd.yml:
--------------------------------------------------------------------------------
1 | name: Build AMD image using depot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: depot-ubuntu-24.04-8
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Set up Depot CLI
15 | uses: depot/setup-action@v1
16 |
17 | - name: Login to DockerHub
18 | uses: docker/login-action@v2
19 | with:
20 | username: ${{ secrets.DOCKERHUB_USERNAME }}
21 | password: ${{ secrets.DOCKERHUB_TOKEN }}
22 |
23 | - name: Build and push
24 | uses: depot/build-push-action@v1
25 | env:
26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 | with:
28 | # if no depot.json file is at the root of your repo, you must specify the project id
29 | project: 39xfdrxfqt
30 | push: true
31 | tags: tensorwave/scalarlm-amd:latest
32 | build-args: |
33 | BASE_NAME=amd
34 | VLLM_TARGET_DEVICE=rocm
35 | PYTORCH_ROCM_ARCH=gfx90a;gfx942
36 | MAX_JOBS=8
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/.github/workflows/depot-cpu.yml:
--------------------------------------------------------------------------------
1 | name: Build CPU image using depot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: depot-ubuntu-22.04-8
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Set up Depot CLI
15 | uses: depot/setup-action@v1
16 |
17 | - name: Login to DockerHub
18 | uses: docker/login-action@v2
19 | with:
20 | username: ${{ secrets.DOCKERHUB_USERNAME }}
21 | password: ${{ secrets.DOCKERHUB_TOKEN }}
22 |
23 | - name: Build and push
24 | uses: depot/build-push-action@v1
25 | env:
26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 | with:
28 | # if no depot.json file is at the root of your repo, you must specify the project id
29 | project: 39xfdrxfqt
30 | push: true
31 | tags: tensorwave/scalarlm-cpu:latest
32 | build-args: |
33 | BASE_NAME=cpu
34 | VLLM_TARGET_DEVICE=cpu
35 | TORCH_CUDA_ARCH_LIST=""
36 | MAX_JOBS=8
37 |
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia-8.0.yml:
--------------------------------------------------------------------------------
1 | name: Build NVIDIA CUDA 8.0 image using depot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: depot-ubuntu-22.04
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Set up Depot CLI
15 | uses: depot/setup-action@v1
16 |
17 | - name: Login to DockerHub
18 | uses: docker/login-action@v2
19 | with:
20 | username: ${{ secrets.DOCKERHUB_USERNAME }}
21 | password: ${{ secrets.DOCKERHUB_TOKEN }}
22 |
23 | - name: Build and push
24 | uses: depot/build-push-action@v1
25 | env:
26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 | with:
28 | # if no depot.json file is at the root of your repo, you must specify the project id
29 | project: 39xfdrxfqt
30 | push: true
31 | tags: gdiamos/scalarlm-nvidia-8.0:latest
32 | build-args: |
33 | BASE_NAME=nvidia
34 | VLLM_TARGET_DEVICE=cuda
35 | TORCH_CUDA_ARCH_LIST=8.0
36 | MAX_JOBS=2
37 |
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia-8.6.yml:
--------------------------------------------------------------------------------
1 | name: Build NVIDIA CUDA 8.6 image using depot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: depot-ubuntu-22.04
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Set up Depot CLI
15 | uses: depot/setup-action@v1
16 |
17 | - name: Login to DockerHub
18 | uses: docker/login-action@v2
19 | with:
20 | username: ${{ secrets.DOCKERHUB_USERNAME }}
21 | password: ${{ secrets.DOCKERHUB_TOKEN }}
22 |
23 | - name: Build and push
24 | uses: depot/build-push-action@v1
25 | env:
26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 | with:
28 | # if no depot.json file is at the root of your repo, you must specify the project id
29 | project: 39xfdrxfqt
30 | push: true
31 | tags: tensorwave/scalarlm-nvidia-8.6:latest
32 | build-args: |
33 | BASE_NAME=nvidia
34 | VLLM_TARGET_DEVICE=cuda
35 | TORCH_CUDA_ARCH_LIST=8.6
36 | MAX_JOBS=2
37 |
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia.yml:
--------------------------------------------------------------------------------
1 | name: Build NVIDIA image using depot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: depot-ubuntu-22.04-8
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Set up Depot CLI
15 | uses: depot/setup-action@v1
16 |
17 | - name: Login to DockerHub
18 | uses: docker/login-action@v2
19 | with:
20 | username: ${{ secrets.DOCKERHUB_USERNAME }}
21 | password: ${{ secrets.DOCKERHUB_TOKEN }}
22 |
23 | - name: Build and push
24 | uses: depot/build-push-action@v1
25 | env:
26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 | with:
28 | # if no depot.json file is at the root of your repo, you must specify the project id
29 | project: 39xfdrxfqt
30 | push: true
31 | tags: tensorwave/scalarlm-nvidia:latest
32 | build-args: |
33 | BASE_NAME=nvidia
34 | VLLM_TARGET_DEVICE=cuda
35 | TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0
36 | MAX_JOBS=8
37 |
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Build and run unit tests
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 |
7 | jobs:
8 | docker-image:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout repo
12 | uses: actions/checkout@v3
13 |
14 | - name: Run tests
15 | run: >
16 | ./cray test
17 |
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/*.swp
2 | **/*.so
3 | models/*
4 | infra/slurm_configs/slurm.conf
5 | scripts/cray
6 |
7 | *.DS_Store
8 | **/__pycache__/
9 | .env
10 | .idea
11 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 24.10.0
4 | hooks:
5 | - id: black
6 |
7 |
--------------------------------------------------------------------------------
/cmd/bashly.sh:
--------------------------------------------------------------------------------
1 | # -e exit on first failure
2 | # -x all executed commands are printed to the terminal
3 | # -u unset variables are errors
4 | # -a export all variables to the environment
5 | # -E any trap on ERR is inherited by shell functions
6 | # -o pipefail produces a failure code if any stage fails
7 | set -Eeuoxa pipefail
8 |
9 | # Get the directory of this script
10 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
11 |
12 | TTY=-t
13 | if test -t 0; then
14 | TTY=-it
15 | fi
16 |
17 | # Run the docker container
18 | docker run --rm $TTY --user $(id -u):$(id -g) \
19 | --volume "$LOCAL_DIRECTORY:/app/cmd" \
20 | --volume "$LOCAL_DIRECTORY/../scripts:/app/scripts" \
21 | --volume "$LOCAL_DIRECTORY/bashly-settings.yml:/app/bashly-settings.yml" \
22 | dannyben/bashly "$@"
23 |
--------------------------------------------------------------------------------
/cmd/benchmark_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | target=${args[target]}
4 | visible_gpus=${args[visible-gpus]}
5 |
6 | ./cray build-image $target
7 |
8 | declare -a benchmark_command_parts
9 | benchmark_command_parts=(
10 | "CUDA_VISIBLE_DEVICES=${visible_gpus}" "python" "/app/cray/test/benchmark/main.py"
11 | )
12 |
13 | benchmark_command="${benchmark_command_parts[*]}"
14 |
15 | echo $benchmark_command
16 |
17 | # Get the directory of this script
18 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
19 |
20 | # Set cwd to the project root directory
21 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
22 |
23 | declare -a docker_command_parts
24 |
25 | # Make sure the data directory exists
26 | mkdir -p $ROOT_DIRECTORY/data
27 |
28 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host" "-v" "$ROOT_DIRECTORY/data:/app/cray/data")
29 |
30 | declare -a gpu_options
31 |
32 | # Set the GPU options depending on the target
33 | if [ "$target" == "cpu" ]; then
34 | gpu_options+=()
35 | elif [ "$target" == "amd" ]; then
36 | gpu_options+=("--device" "/dev/kfd" "--device" "/dev/dri")
37 | else
38 | gpu_options+=("--gpus" "all")
39 | fi
40 |
41 | docker_command_parts+=("${gpu_options[@]}")
42 | docker_command_parts+=("cray:latest" "sh" "-c" "'$benchmark_command'")
43 |
44 | docker_command="${docker_command_parts[*]}"
45 | echo $docker_command
46 | eval $docker_command
47 |
48 |
--------------------------------------------------------------------------------
/cmd/build_image_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | target=${args[target]}
4 |
5 | declare -a vllm_target_device
6 | declare -a docker_platform
7 |
8 | # If target is cpu, build the image with the cpu base image
9 | if [ "$target" == "cpu" ]; then
10 | vllm_target_device=("cpu")
11 | if [ "$(uname -m)" == "x86_64" ]; then
12 | docker_platform=("linux/amd64")
13 | else
14 | docker_platform=("linux/arm64/v8")
15 | fi
16 | elif [ "$target" == "amd" ]; then
17 | vllm_target_device=("rocm")
18 | docker_platform=("linux/amd64")
19 | else
20 | vllm_target_device=("cuda")
21 | docker_platform=("linux/amd64")
22 | fi
23 |
24 | docker_build_command="docker build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t cray:latest --shm-size=8g ."
25 |
26 | # Run docker build command
27 | echo $(green_bold Building image with command: ${docker_build_command})
28 | eval $docker_build_command
29 |
30 | echo $(green_bold Successfully built image)
31 |
--------------------------------------------------------------------------------
/cmd/depot_build_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | target=${args[target]}
4 |
5 | declare -a vllm_target_device
6 | declare -a docker_platform
7 |
8 | # If target is cpu, build the image with the cpu base image
9 | if [ "$target" == "cpu" ]; then
10 | vllm_target_device=("cpu")
11 | docker_platform=("linux/amd64")
12 | elif [ "$target" == "arm" ]; then
13 | vllm_target_device=("cpu")
14 | docker_platform=("linux/arm64/v8")
15 | elif [ "$target" == "amd" ]; then
16 | vllm_target_device=("rocm")
17 | docker_platform=("linux/amd64")
18 | else
19 | vllm_target_device=("cuda")
20 | docker_platform=("linux/amd64")
21 | fi
22 |
23 | docker_build_command="depot build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t gdiamos/cray-${target}:latest --push ."
24 |
25 | # Run docker build command
26 | echo $(green_bold Building image with command: ${docker_build_command})
27 | eval $docker_build_command
28 |
29 | echo $(green_bold Successfully built image)
30 |
31 |
--------------------------------------------------------------------------------
/cmd/llm_logs_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | model=${args[model]}
4 | tail=${args[--tail]}
5 | lines=${args[--lines]}
6 | follow=${args[--follow]}
7 |
8 | if [ -z "$model" ]; then
9 | model="latest"
10 | fi
11 |
12 | ./cray build-image
13 |
14 | declare -a log_command_parts
15 | log_command_parts=(
16 | "python" "/app/cray/sdk/masint/cli/main.py" "logs" "--model" "$model" "--lines" "$lines"
17 | )
18 |
19 | echo $tail
20 |
21 | # If tail exists, add it to the command
22 | if [ -n "$tail" ]; then
23 | log_command_parts+=("--tail")
24 | fi
25 |
26 | # If follow exists, add it to the command
27 | if [ -n "$follow" ]; then
28 | log_command_parts+=("--follow")
29 | fi
30 |
31 | log_command="${log_command_parts[*]}"
32 |
33 | echo $log_command
34 |
35 | declare -a docker_command_parts
36 |
37 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host")
38 |
39 | docker_command_parts+=("cray:latest" "sh" "-c" "'$log_command'")
40 |
41 | docker_command="${docker_command_parts[*]}"
42 | echo $docker_command
43 | eval $docker_command
44 |
45 |
46 |
--------------------------------------------------------------------------------
/cmd/llm_ls_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | ./cray build-image
4 |
5 | declare -a ls_command_parts
6 | ls_command_parts=(
7 | "python" "/app/cray/sdk/masint/cli/main.py" "ls"
8 | )
9 |
10 | ls_command="${ls_command_parts[*]}"
11 |
12 | echo $ls_command
13 |
14 | # Get the directory of this script
15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
16 |
17 | # Set cwd to the project root directory
18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
19 |
20 | declare -a docker_command_parts
21 |
22 | # Make sure the data directory exists
23 | mkdir -p $ROOT_DIRECTORY/data
24 |
25 | docker_command_parts=("docker" "run" "--rm" "--network" "host")
26 |
27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$ls_command'")
28 |
29 | docker_command="${docker_command_parts[*]}"
30 | echo $docker_command
31 | eval $docker_command
32 |
33 |
--------------------------------------------------------------------------------
/cmd/llm_plot_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | model=${args[model]}
4 |
5 | if [ -z "$model" ]; then
6 | model="latest"
7 | fi
8 |
9 | ./cray build-image
10 |
11 | declare -a plot_command_parts
12 | plot_command_parts=(
13 | "python" "/app/cray/sdk/masint/cli/main.py" "plot" "--model" "$model"
14 | )
15 |
16 | plot_command="${plot_command_parts[*]}"
17 |
18 | echo $plot_command
19 |
20 | # Get the directory of this script
21 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
22 |
23 | # Set cwd to the project root directory
24 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
25 |
26 | declare -a docker_command_parts
27 |
28 | # Make sure the data directory exists
29 | mkdir -p $ROOT_DIRECTORY/data
30 |
31 | docker_command_parts=("docker" "run" "--rm" "-v" "$ROOT_DIRECTORY/data:/app/cray/data" "--network" "host")
32 |
33 | docker_command_parts+=("cray:latest" "sh" "-c" "'$plot_command'")
34 |
35 | docker_command="${docker_command_parts[*]}"
36 | echo $docker_command
37 | eval $docker_command
38 |
39 |
--------------------------------------------------------------------------------
/cmd/llm_squeue_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | ./cray build-image
4 |
5 | declare -a squeue_command_parts
6 | squeue_command_parts=(
7 | "python" "/app/cray/sdk/masint/cli/main.py" "squeue"
8 | )
9 |
10 | squeue_command="${squeue_command_parts[*]}"
11 |
12 | echo $squeue_command
13 |
14 | # Get the directory of this script
15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
16 |
17 | # Set cwd to the project root directory
18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
19 |
20 | declare -a docker_command_parts
21 |
22 | # Make sure the data directory exists
23 | mkdir -p $ROOT_DIRECTORY/data
24 |
25 | docker_command_parts=("docker" "run" "--rm" "--network" "host")
26 |
27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$squeue_command'")
28 |
29 | docker_command="${docker_command_parts[*]}"
30 | echo $docker_command
31 | eval $docker_command
32 |
33 |
34 |
--------------------------------------------------------------------------------
/cmd/pypi_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | # Get the directory of this script
4 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
5 |
6 | # Set cwd to the project sdk directory
7 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/../sdk
8 |
9 | cd $ROOT_DIRECTORY
10 |
11 | # Build sdk wheel from sdk/pyproject.toml
12 | wheel_build_command="python -m build --sdist --wheel --outdir dist/ ."
13 |
14 | # Run sdk wheel build
15 | echo $(green_bold Building wheel with command: ${wheel_build_command})
16 | eval $wheel_build_command
17 |
18 | echo $(green_bold Successfully built wheel)
19 |
20 | # Upload wheel to pypi
21 | pypi_upload_command="twine upload dist/*"
22 |
23 | # Run pypi upload command
24 | echo $(green_bold Uploading wheel to pypi with command: ${pypi_upload_command})
25 | eval $pypi_upload_command
26 |
27 | echo $(green_bold Successfully uploaded wheel to pypi)
28 |
29 |
--------------------------------------------------------------------------------
/cmd/up_command.sh:
--------------------------------------------------------------------------------
1 | inspect_args
2 |
3 | target=${args[target]}
4 |
5 | declare -a vllm_target_device
6 | declare -a docker_compose_service
7 |
8 | if [ "$target" == "cpu" ]; then
9 | vllm_target_device=("cpu")
10 | docker_compose_service="cray"
11 | elif [ "$target" == "amd" ]; then
12 | vllm_target_device=("rocm")
13 | docker_compose_service="cray-amd"
14 | else
15 | vllm_target_device=("cuda")
16 | docker_compose_service="cray-nvidia"
17 | fi
18 |
19 | BASE_NAME=${target} VLLM_TARGET_DEVICE=${vllm_target_device} docker compose -f docker-compose.yaml up ${docker_compose_service} --build --force-recreate
20 |
--------------------------------------------------------------------------------
/deployment/ansible/hosts:
--------------------------------------------------------------------------------
2 | [localhost]
3 | localhost ansible_connection=local
4 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
2 | {{- define "scalarlm.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "scalarlm.vllmname" -}}
7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
8 | {{- end -}}
9 |
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 |
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/api_configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-api-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 | server_list: api
13 | max_train_time: {{ .Values.max_train_time }}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.api_port }}
9 | targetPort: 8000
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.labels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-vllm-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 | server_list: vllm
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.vllmname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.vllm_port }}
9 | targetPort: 8001
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/local-hostpath-sc.yaml:
--------------------------------------------------------------------------------
1 | # local-hostpath-sc.yaml
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 | name: local-hostpath
6 | provisioner: kubernetes.io/no-provisioner
7 | volumeBindingMode: Immediate
8 |
9 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/storageclass-clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: storageclass-manager
5 | rules:
6 | - apiGroups: ["storage.k8s.io"]
7 | resources: ["storageclasses"]
8 | verbs: ["get", "list", "create", "delete", "patch", "update"]
9 |
10 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
2 | {{- define "scalarlm.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "scalarlm.vllmname" -}}
7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
8 | {{- end -}}
9 |
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 |
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/api_configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-api-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 | server_list: api
13 | max_train_time: {{ .Values.max_train_time }}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.api_port }}
9 | targetPort: 8000
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.labels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-cache
5 | annotations:
6 | helm.sh/resource-policy: keep
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.cache_pvc.size }}
13 | storageClassName: {{ .Values.cache_pvc.storageClass }}
14 | wait_until_bound: false
15 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-jobs
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: {{ .Values.jobs_pvc.size }}
11 | storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 | wait_until_bound: false
13 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-vllm-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 | server_list: vllm
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.vllmname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.vllm_port }}
9 | targetPort: 8001
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/values.yaml:
--------------------------------------------------------------------------------
1 | image:
2 | repository: sudnya/scalarlm-rocm
3 | tag: v0.7
4 | pullPolicy: Always
5 |
6 | env:
7 | - name: HIP_VISIBLE_DEVICES
8 | value: "0"
9 | - name: ROCR_VISIBLE_DEVICES
10 | value: "0"
11 |
12 | service:
13 | type: ClusterIP
14 | api_port: 8000
15 | vllm_port: 8001
16 | externalIP: 10.1.81.248
17 |
18 | jobs_pvc:
19 | storageClass: openebs-hostpath
20 | size: 100Gi
21 |
22 | cache_pvc:
23 | storageClass: openebs-hostpath
24 | size: 32Gi
25 |
26 | model: meta-llama/Llama-3.1-8B-Instruct
27 | max_model_length: 4096
28 | gpu_memory_utilization: 0.95
29 |
30 | training_gpus: 2
31 | inference_gpus: 1
32 |
33 | max_train_time: 86400
34 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
2 | {{- define "cray.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "cray.labels" -}}
7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
8 | app.kubernetes.io/instance: {{ .Release.Name }}
9 | {{- end -}}
10 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |
12 |
13 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "cray.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.port }}
9 | targetPort: {{ .Values.service.targetPort }}
10 | protocol: TCP
11 | name: http
12 | - port: 8001
13 | targetPort: 8001
14 | protocol: TCP
15 | name: http2
16 | externalIPs:
17 | - {{ .Values.service.externalIP }}
18 | selector:
19 | {{- include "cray.labels" . | nindent 4 }}
20 |
--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for cray-chart.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 |
5 | replicaCount: 1
6 |
7 | image:
8 | repository: sudnya/scalarlm-rocm
9 | tag: latest
10 | pullPolicy: Always
11 |
12 | env:
13 | - name: HIP_VISIBLE_DEVICES
14 | value: "0"
15 | - name: ROCR_VISIBLE_DEVICES
16 | value: "0"
17 | service:
18 | type: ClusterIP
19 | port: 8000
20 | targetPort: 8000
21 | externalIP: 10.1.81.248
22 |
23 | model: meta-llama/Llama-3.1-8B-Instruct
24 | max_model_length: 4096
25 | gpu_memory_utilization: 0.33
26 |
--------------------------------------------------------------------------------
/deployment/helm/cray/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/deployment/helm/cray/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: cray
3 | description: A Helm chart for Kubernetes
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.1.0
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | #appVersion: "1.16.0"
25 |
--------------------------------------------------------------------------------
/deployment/helm/cray/templates/hpa.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.autoscaling.enabled }}
2 | apiVersion: autoscaling/v2
3 | kind: HorizontalPodAutoscaler
4 | metadata:
5 | name: {{ include "cray.fullname" . }}
6 | labels:
7 | {{- include "cray.labels" . | nindent 4 }}
8 | spec:
9 | scaleTargetRef:
10 | apiVersion: apps/v1
11 | kind: Deployment
12 | name: {{ include "cray.fullname" . }}
13 | minReplicas: {{ .Values.autoscaling.minReplicas }}
14 | maxReplicas: {{ .Values.autoscaling.maxReplicas }}
15 | metrics:
16 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
17 | - type: Resource
18 | resource:
19 | name: cpu
20 | target:
21 | type: Utilization
22 | averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
23 | {{- end }}
24 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
25 | - type: Resource
26 | resource:
27 | name: memory
28 | target:
29 | type: Utilization
30 | averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
31 | {{- end }}
32 | {{- end }}
33 |
--------------------------------------------------------------------------------
/deployment/helm/cray/templates/ingress.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.ingress.enabled -}}
2 | apiVersion: networking.k8s.io/v1
3 | kind: Ingress
4 | metadata:
5 | name: {{ include "cray.fullname" . }}
6 | labels:
7 | {{- include "cray.labels" . | nindent 4 }}
8 | {{- with .Values.ingress.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | spec:
13 | {{- with .Values.ingress.className }}
14 | ingressClassName: {{ . }}
15 | {{- end }}
16 | {{- if .Values.ingress.tls }}
17 | tls:
18 | {{- range .Values.ingress.tls }}
19 | - hosts:
20 | {{- range .hosts }}
21 | - {{ . | quote }}
22 | {{- end }}
23 | secretName: {{ .secretName }}
24 | {{- end }}
25 | {{- end }}
26 | rules:
27 | {{- range .Values.ingress.hosts }}
28 | - host: {{ .host | quote }}
29 | http:
30 | paths:
31 | {{- range .paths }}
32 | - path: {{ .path }}
33 | {{- with .pathType }}
34 | pathType: {{ . }}
35 | {{- end }}
36 | backend:
37 | service:
38 | name: {{ include "cray.fullname" $ }}
39 | port:
40 | number: {{ $.Values.service.port }}
41 | {{- end }}
42 | {{- end }}
43 | {{- end }}
44 |
--------------------------------------------------------------------------------
/deployment/helm/cray/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "cray.fullname" . }}
5 | labels:
6 | {{- include "cray.labels" . | nindent 4 }}
7 | spec:
8 | type: {{ .Values.service.type }}
9 | ports:
10 | - port: {{ .Values.service.port }}
11 | targetPort: http
12 | protocol: TCP
13 | name: http
14 | selector:
15 | {{- include "cray.selectorLabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/cray/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceAccount.create -}}
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: {{ include "cray.serviceAccountName" . }}
6 | labels:
7 | {{- include "cray.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 | {{- end }}
14 |
--------------------------------------------------------------------------------
/deployment/helm/cray/templates/tests/test-connection.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: "{{ include "cray.fullname" . }}-test-connection"
5 | labels:
6 | {{- include "cray.labels" . | nindent 4 }}
7 | annotations:
8 | "helm.sh/hook": test
9 | spec:
10 | containers:
11 | - name: wget
12 | image: busybox
13 | command: ['wget']
14 | args: ['{{ include "cray.fullname" . }}:{{ .Values.service.port }}']
15 | restartPolicy: Never
16 |
--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 |
2 | {{- define "cray.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "cray.labels" -}}
7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
8 | app.kubernetes.io/instance: {{ .Release.Name }}
9 | {{- end -}}
10 |
--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |
12 |
13 |
--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "cray.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.port }}
9 | targetPort: {{ .Values.service.targetPort }}
10 | protocol: TCP
11 | name: http
12 | - port: 8001
13 | targetPort: 8001
14 | protocol: TCP
15 | name: http2
16 | externalIPs:
17 | - {{ .Values.service.externalIP }}
18 | selector:
19 | {{- include "cray.labels" . | nindent 4 }}
20 |
--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for cray-chart.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 |
5 | replicaCount: 1
6 |
7 | image:
8 | repository: gdiamos/cray-nvidia
9 | tag: latest
10 | pullPolicy: IfNotPresent
11 |
12 | service:
13 | type: ClusterIP
14 | port: 8000
15 | targetPort: 8000
16 | externalIP: 104.171.203.79
17 |
18 | model: meta-llama/Llama-3.2-3B-Instruct
19 | max_model_length: 4096
20 | gpu_memory_utilization: 0.33
21 |
22 |
--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 |
2 | {{- define "cray.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "cray.labels" -}}
7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
8 | app.kubernetes.io/instance: {{ .Release.Name }}
9 | {{- end -}}
10 |
--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: {{ include "cray.fullname" . }}
5 | spec:
6 | replicas: {{ .Values.replicaCount }}
7 | selector:
8 | matchLabels:
9 | {{- include "cray.labels" . | nindent 6 }}
10 | template:
11 | metadata:
12 | labels:
13 | {{- include "cray.labels" . | nindent 8 }}
14 | spec:
15 | {{- with .Values.imagePullSecrets }}
16 | imagePullSecrets:
17 | {{- toYaml . | nindent 8 }}
18 | {{- end }}
19 | containers:
20 | - name: {{ .Chart.Name }}
21 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
22 | imagePullPolicy: {{ .Values.image.pullPolicy }}
23 | command: ["/app/cray/scripts/start_one_server.sh"]
24 | ports:
25 | - name: http
26 | containerPort: 8000
27 | hostPort: 8000
28 | protocol: TCP
29 | - name: http2
30 | containerPort: 8001
31 | hostPort: 8001
32 | protocol: TCP
33 | volumeMounts:
34 | {{- range .Values.volumes }}
35 | - name: {{ .name }}
36 | mountPath: {{ .path }}
37 | {{- end }}
38 | volumes:
39 | {{- range .Values.volumes }}
40 | - name: {{ .name }}
41 | hostPath:
42 | path: {{ .hostPath }}
43 | {{- end }}
44 |
--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "cray.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.port }}
9 | targetPort: {{ .Values.service.targetPort }}
10 | protocol: TCP
11 | name: http
12 | - port: 8001
13 | targetPort: 8001
14 | protocol: TCP
15 | name: http2
16 | selector:
17 | {{- include "cray.labels" . | nindent 4 }}
18 |
--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for cray-chart.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 |
5 | replicaCount: 1
6 |
7 | image:
8 | repository: gdiamos/masint-arm
9 | tag: latest
10 | pullPolicy: IfNotPresent
11 |
12 | service:
13 | type: ClusterIP
14 | port: 8000
15 | targetPort: 8000
16 |
17 | volumes:
18 | - name: ml
19 | path: /app/cray/ml
20 | hostPath: /Users/gregorydiamos/checkout/cray/ml
21 |
22 | network:
23 | name: cray-network
24 |
25 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 |
2 | {{- define "scalarlm.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "scalarlm.vllmname" -}}
7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
8 | {{- end -}}
9 |
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 |
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/api_configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-api-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 | server_list: api
13 | max_train_time: {{ .Values.max_train_time }}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.api_port }}
9 | targetPort: 8000
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.labels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-3b-cache
5 | annotations:
6 | helm.sh/resource-policy: keep
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.cache_pvc.size }}
13 | storageClassName: {{ .Values.cache_pvc.storageClass }}
14 | wait_until_bound: false
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-3b-jobs
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: {{ .Values.jobs_pvc.size }}
11 | storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 | wait_until_bound: false
13 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-vllm-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 | server_list: vllm
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.vllmname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.vllm_port }}
9 | targetPort: 8001
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
1 | image:
2 | repository: tensorwave/scalarlm-amd
3 | tag: latest
4 | pullPolicy: Always
5 |
6 | service:
7 | type: ClusterIP
8 | api_port: 8100
9 | vllm_port: 8101
10 | externalIP: 64.139.222.101
11 |
12 | jobs_pvc:
13 | storageClass: local-path
14 | size: 100Gi
15 |
16 | cache_pvc:
17 | storageClass: local-path
18 | size: 16Gi
19 |
20 | model: meta-llama/Llama-3.2-3B-Instruct
21 | max_model_length: 32768
22 | gpu_memory_utilization: 0.95
23 |
24 | training_gpus: 1
25 | inference_gpus: 1
26 |
27 | max_train_time: 14400
28 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 |
2 | {{- define "scalarlm.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "scalarlm.vllmname" -}}
7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
8 | {{- end -}}
9 |
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 |
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/api_configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-api-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 | server_list: api
13 | max_train_time: {{ .Values.max_train_time }}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.api_port }}
9 | targetPort: 8000
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.labels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-70b-cache
5 | annotations:
6 | helm.sh/resource-policy: keep
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.cache_pvc.size }}
13 | storageClassName: {{ .Values.cache_pvc.storageClass }}
14 | wait_until_bound: false
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-70b-jobs
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: {{ .Values.jobs_pvc.size }}
11 | storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 | wait_until_bound: false
13 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-vllm-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 | server_list: vllm
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.vllmname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.vllm_port }}
9 | targetPort: 8001
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
1 | image:
2 | repository: sudnya/scalarlm-rocm
3 | tag: v0.8
4 | pullPolicy: Always
5 |
6 | service:
7 | type: ClusterIP
8 | api_port: 8200
9 | vllm_port: 8201
10 | externalIP: 64.139.222.101
11 |
12 | jobs_pvc:
13 | storageClass: local-path
14 | size: 100Gi
15 |
16 | cache_pvc:
17 | storageClass: local-path
18 | size: 200Gi
19 |
20 | model: meta-llama/Llama-3.3-70B-Instruct
21 | max_model_length: 4096
22 | gpu_memory_utilization: 0.95
23 |
24 | training_gpus: 2
25 | inference_gpus: 1
26 |
27 | max_train_time: 86400
28 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 |
2 | {{- define "scalarlm.fullname" -}}
3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
4 | {{- end -}}
5 |
6 | {{- define "scalarlm.vllmname" -}}
7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
8 | {{- end -}}
9 |
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 |
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/api_configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-api-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 | server_list: api
13 | max_train_time: {{ .Values.max_train_time }}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.fullname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.api_port }}
9 | targetPort: 8000
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.labels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-8b-cache
5 | annotations:
6 | helm.sh/resource-policy: keep
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.cache_pvc.size }}
13 | storageClassName: {{ .Values.cache_pvc.storageClass }}
14 | wait_until_bound: false
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: scalarlm-8b-jobs
5 | spec:
6 | accessModes:
7 | - ReadWriteOnce
8 | resources:
9 | requests:
10 | storage: {{ .Values.jobs_pvc.size }}
11 | storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 | wait_until_bound: false
13 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
1 | # templates/configmap.yaml
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ .Release.Name }}-vllm-configmap
6 | data:
7 | cray-config.yaml: |
8 | model: {{ .Values.model }}
9 | max_model_length: {{ .Values.max_model_length }}
10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 | server_list: vllm
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "scalarlm.vllmname" . }}
5 | spec:
6 | type: {{ .Values.service.type }}
7 | ports:
8 | - port: {{ .Values.service.vllm_port }}
9 | targetPort: 8001
10 | protocol: TCP
11 | name: http
12 | externalIPs:
13 | - {{ .Values.service.externalIP }}
14 | selector:
15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 |
--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
1 | image:
2 | repository: gdiamos/scalarlm-amd #tensorwave/scalarlm-amd
3 | tag: fsdp
4 | pullPolicy: Always
5 |
6 | service:
7 | type: ClusterIP
8 | api_port: 9000
9 | vllm_port: 9001
10 | externalIP: 64.139.222.101
11 |
12 | jobs_pvc:
13 | storageClass: local-path
14 | size: 100Gi
15 |
16 | cache_pvc:
17 | storageClass: local-path
18 | size: 32Gi
19 |
20 | model: meta-llama/Llama-3.1-8B-Instruct
21 | max_model_length: 4096
22 | gpu_memory_utilization: 0.95
23 |
24 | training_gpus: 2
25 | inference_gpus: 1
26 |
27 | max_train_time: 86400
28 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 |
3 | cray: &cray
4 | command: /app/cray/scripts/start_one_server.sh
5 | build:
6 | context: .
7 | dockerfile: Dockerfile
8 | args:
9 | - BASE_NAME=${BASE_NAME}
10 | - VLLM_TARGET_DEVICE=${VLLM_TARGET_DEVICE}
11 | ports:
12 | - "8000:8000"
13 | - "8001:8001"
14 | volumes:
15 | - type: bind
16 | source: ./models
17 | target: /root/.cache/huggingface
18 | - type: bind
19 | source: ./infra/cray_infra
20 | target: /app/cray/infra/cray_infra
21 | - type: bind
22 | source: ./scripts
23 | target: /app/cray/scripts
24 | - type: bind
25 | source: ./ml
26 | target: /app/cray/ml
27 | - type: bind
28 | source: ./test
29 | target: /app/cray/test
30 | networks:
31 | - cray-network
32 |
33 |
34 | cray-nvidia:
35 | <<: *cray
36 | deploy:
37 | resources:
38 | reservations:
39 | devices:
40 | - driver: nvidia
41 | capabilities: [gpu]
42 |
43 | cray-amd:
44 | <<: *cray
45 | devices:
46 | - /dev/kfd
47 | - /dev/dri
48 | security_opt:
49 | - seccomp:unconfined
50 |
51 |
52 | networks:
53 | cray-network:
54 | name: cray_network
55 |
56 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/arch.md:
--------------------------------------------------------------------------------
1 | # ScalarLM
2 |
3 | ScalarLM has three high-level APIs:
4 |
5 | * **completions** provides OpenAI client compatibility
6 | * **generate** provides a simple interface for generating text
7 | * **train** provides a simple interface for submitting training jobs
8 |
9 | 
10 |
11 |
12 | Inference is performed by vLLM workers that are orchestrated by pulling requests from a queue.
13 |
14 | Training is performed by Megatron-LM workers that are orchestrated by SLURM.
15 |
16 | Trained models are automatically registered with the inference workers.
17 |
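18 | As a rough end-to-end sketch of how these APIs fit together from a client's point of view (assuming the `masint` Python client shown in the inference and training examples, and a hypothetical local deployment at `http://localhost:8000`; the completions API is the OpenAI-compatible route covered under Inference):
19 |
20 | ```python
21 | import masint
22 |
23 | # Point the client at a ScalarLM deployment (the local URL is an assumption).
24 | masint.api_url = "http://localhost:8000"
25 |
26 | llm = masint.SupermassiveIntelligence()
27 |
28 | # generate: batch text generation over a list of prompts.
29 | print(llm.generate(prompts=["What is 2 + 2?"]))
30 |
31 | # train: submit a training job; the trained model is later registered with
32 | # the inference workers automatically.
33 | status = llm.train(
34 |     [{"input": "What is 2 + 2?", "output": "4"}],
35 |     train_args={"max_steps": 10, "learning_rate": 3e-3},
36 | )
37 | print(status)
38 | ```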
--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/cray-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray-arch.png
--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/cray.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray.jpeg
--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/loss_plot_044db4ac60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/loss_plot_044db4ac60.png
--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/list-models.md:
--------------------------------------------------------------------------------
1 | # List Models
2 |
3 | ```console
4 | ./cray llm ls
5 | ```
6 |
7 | This command lists all of the models that have been trained on the ScalarLM server.
8 |
9 | ```console
10 | 69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e
11 | ```
12 |
13 | ScalarLM names models with a unique identifier based on the input data and training parameters.
14 |
15 |
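16 | Purely as an illustration of content-based naming (this is not ScalarLM's actual hashing scheme), such an identifier could be derived by hashing the training data together with the training parameters:
17 |
18 | ```python
19 | import hashlib
20 | import json
21 |
22 | def illustrative_model_id(dataset, train_args):
23 |     # Serialize the inputs deterministically and hash them, so the same data
24 |     # and parameters always map to the same identifier.
25 |     payload = json.dumps({"dataset": dataset, "train_args": train_args}, sort_keys=True)
26 |     return hashlib.sha256(payload.encode("utf-8")).hexdigest()
27 |
28 | print(illustrative_model_id(
29 |     [{"input": "What is 1 + 1?", "output": "2"}],
30 |     {"max_steps": 200, "learning_rate": 3e-3},
31 | ))
32 | ```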
--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/plot.md:
--------------------------------------------------------------------------------
1 | # Plot
2 |
3 | ```console
4 | ./cray llm plot
5 | ```
6 |
7 | This command plots the training loss of a specified model.
8 |
9 | If no model is specified, the command will plot the training loss of the most recently trained model.
10 |
11 | 
12 |
13 |
14 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/squeue.md:
--------------------------------------------------------------------------------
1 | # squeue
2 |
3 | ```console
4 | ./cray llm squeue
5 | ```
6 |
7 | This command is a wrapper around the `squeue` command. It is used to display the status of jobs in the training queue. The output is similar to the `squeue` command, but with some additional formatting.
8 |
9 | ```console
10 | JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON)
11 | 8 short 00f186ab039b root PENDING 0:00 20:00 1 (Priority)
12 | 7 short f1ba9c0eb11b root PENDING 0:00 20:00 1 (Priority)
13 | 6 short 0746261fd1db root PENDING 0:00 20:00 1 (Priority)
14 | 5 short ae55dedbb496 root PENDING 0:00 20:00 1 (Priority)
15 | 4 short d2bc30a36081 root PENDING 0:00 20:00 1 (Priority)
16 | 3 short bce8e63a7bef root PENDING 0:00 20:00 1 (Resources)
17 | 2 short c42b59ab0fb1 root RUNNING 0:34 20:00 1 df294b9206ff
18 | ```
19 |
20 |
21 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/contact.md:
--------------------------------------------------------------------------------
1 | # Contact Us
2 |
3 | Project ScalarLM is developed by an Artificial Intelligence engineering consortium, built on a philosophy of open collaboration to improve AI systems. Through our collective engineering efforts with industry and academia, we continually integrate and improve the accuracy, safety, speed, and efficiency of AI technologies, helping companies and universities around the world build better AI systems that benefit society.
4 |
5 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69)
6 |
7 |
8 | * Greg Diamos
9 | * Naila Farooqui
10 | * Sudnya Diamos
11 | * Suhabe Bugrara
12 |
13 |
14 | We accept community contributions and are always looking for new collaborators. If you are interested in contributing to Project ScalarLM, please reach out to us at [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69).
15 |
16 |
17 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/deployment/docker.md:
--------------------------------------------------------------------------------
1 | # Docker builds
2 |
3 | Check out prebuilt docker containers for different targets:
4 |
5 | | Target | Container | Latest Release v0.5 |
6 | | -------- | --------------------------- | ------------------------ |
7 | | NVIDIA | gdiamos/cray-nvidia:latest | gdiamos/cray-nvidia:v0.5 |
8 | | ARM | gdiamos/cray-arm:latest | gdiamos/cray-arm:v0.5 |
9 | | AMD | gdiamos/cray-amd:latest | gdiamos/cray-amd:v0.5 |
10 | | x86 | gdiamos/cray-cpu:latest | gdiamos/cray-cpu:v0.5 |
11 |
12 | For example, to launch a development server on a modern MacBook (e.g., M2):
13 |
14 | ```bash
15 | docker run -it -p 8000:8000 --entrypoint /app/cray/scripts/start_one_server.sh gdiamos/cray-arm:v0.5
16 | ```
17 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/deployment/modal.md:
--------------------------------------------------------------------------------
1 | # Modal
2 |
3 | ScalarLM can be deployed on Modal for easy access to GPUs.
4 |
5 | Clone the [ScalarLM repository](https://github.com/tensorwavecloud/scalarlm) and start the server.
6 |
7 | ```console
8 | git clone git@github.com:tensorwavecloud/scalarlm.git
9 | cd scalarlm
10 | ./cray deploy
11 | ```
12 |
13 | Modal should give you an endpoint you can start using.
14 |
15 |
16 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to ScalarLM
2 |
3 | ScalarLM is a fully open source, CC-0 Licensed, integrated LLM inference and training platform.
4 |
5 | ScalarLM builds on top of the vLLM inference engine, the Megatron-LM training framework, and the HuggingFace model hub. It unifies the capabilities of these tools into a single platform, enabling users to easily perform LLM inference and training, and to build higher-level applications such as Agents with a twist: they can teach themselves new abilities via backpropagation.
6 |
7 | ScalarLM is designed for high performance. It inherits the distributed training capabilities of Megatron-LM and the optimized inference engine of vLLM. ScalarLM is also designed to be easy to use: it provides an OpenAI-compatible server and a simple command line interface for interacting with the platform.
8 |
9 | ScalarLM is inspired by the work of Seymour Roger Cray (September 28, 1925 – October 5, 1996), an American electrical engineer and supercomputer architect who designed a series of computers that were the fastest in the world for decades, and founded Cray Research, which built many of these machines. Called "the father of supercomputing", Cray has been credited with creating the supercomputer industry.
10 |
11 | Learn more about ScalarLM at our [Blog](https://blog.scalarlm.com) and [GitHub](https://github.com/scalarlm/scalarlm).
12 |
13 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69)
14 |
15 | 
16 |
17 |
18 |
--------------------------------------------------------------------------------
/docs/cray-docs/docs/inference.md:
--------------------------------------------------------------------------------
1 | # Inference
2 |
3 |
4 | ## OpenAI Compatible Server
5 |
6 | ```console
7 | curl https://meta-llama--llama-3-2-3b-instruct.cray-lm.com/v1/openai/chat/completions \
8 | -H "Content-Type: application/json" \
9 | -d '{
10 | "model": "meta-llama/Llama-3.2-3B-Instruct",
11 | "messages": [
12 | {"role": "system", "content": "You are a helpful assistant."},
13 | {"role": "user", "content": "Who won the world series in 2020?"}
14 | ]
15 | }'
16 | ```
17 |
18 | ## Using the Python client
19 |
20 | You can also use the Python client to interact with the ScalarLM server.
21 |
22 | ```python
23 |
24 | import masint
25 |
26 | masint.api_url = "https://meta-llama--llama-3-2-3b-instruct.cray-lm.com"
27 |
28 | def get_dataset():
29 | dataset = []
30 |
31 | count = 4
32 |
33 | for i in range(count):
34 | dataset.append(f"What is {i} + {i}?")
35 |
36 | return dataset
37 |
38 |
39 | llm = masint.SupermassiveIntelligence()
40 |
41 | dataset = get_dataset()
42 |
43 | results = llm.generate(prompts=dataset)
44 |
45 | print(results)
46 | ```
47 |
48 |
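49 | ## Using the OpenAI Python client
50 |
51 | Because the server exposes an OpenAI-compatible route (see the curl example above), the standard `openai` Python package can also be pointed at it. This is a sketch, not a tested recipe: the endpoint URL comes from the curl example, and the assumption that no real API key is required is ours.
52 |
53 | ```python
54 | from openai import OpenAI
55 |
56 | client = OpenAI(
57 |     base_url="https://meta-llama--llama-3-2-3b-instruct.cray-lm.com/v1/openai",
58 |     api_key="unused",  # assumption: this endpoint does not validate API keys
59 | )
60 |
61 | response = client.chat.completions.create(
62 |     model="meta-llama/Llama-3.2-3B-Instruct",
63 |     messages=[
64 |         {"role": "system", "content": "You are a helpful assistant."},
65 |         {"role": "user", "content": "Who won the world series in 2020?"},
66 |     ],
67 | )
68 |
69 | print(response.choices[0].message.content)
70 | ```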
--------------------------------------------------------------------------------
/docs/cray-docs/docs/training.md:
--------------------------------------------------------------------------------
1 | # Training
2 |
3 | ## Training jobs
4 |
5 | You can also use the Python client to submit training jobs to the ScalarLM server.
6 |
7 | ```python
8 |
9 | import masint
10 |
11 | def get_dataset():
12 | dataset = []
13 |
14 | count = 5
15 |
16 | for i in range(count):
17 | dataset.append(
18 | {"input": f"What is {i} + {i}?", "output": str(i + i)}
19 | )
20 |
21 | return dataset
22 |
23 |
24 | llm = masint.SupermassiveIntelligence()
25 |
26 | dataset = get_dataset()
27 |
28 | status = llm.train(dataset, train_args={"max_steps": 200, "learning_rate": 3e-3})
29 |
30 | print(status)
31 | ```
32 |
33 | You should see command line output like this:
34 |
35 | ```console
36 | (environment) gregorydiamos@Air-Gregory cray % python test/deployment/train.py
37 | {'job_id': '1', 'status': 'QUEUED', 'message': 'Training job launched', 'dataset_id': 'dataset', 'job_directory': '/app/cray/jobs/69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e', 'model_name': '69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e'}
38 | ```
39 |
40 |
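41 | Once the job completes, the `model_name` from the status above can be passed back to the inference APIs to query the newly trained model. A minimal sketch, assuming that `generate` accepts a `model` argument naming a trained model (the underlying generate request schema has an optional `model` field):
42 |
43 | ```python
44 | import masint
45 |
46 | llm = masint.SupermassiveIntelligence()
47 |
48 | # Assumption: the trained model is selected by the name returned from
49 | # llm.train(...).
50 | results = llm.generate(
51 |     prompts=["What is 3 + 3?"],
52 |     model="69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e",
53 | )
54 |
55 | print(results)
56 | ```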
--------------------------------------------------------------------------------
/docs/cray-docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: ScalarLM Docs
2 | nav:
3 | - Getting Started:
4 | - Introduction: index.md
5 | - Quick Start: quickstart.md
6 | - Architecture: arch.md
7 | - Contact: contact.md
8 | - Examples:
9 | - Inference: inference.md
10 | - Training: training.md
11 | - Command Line:
12 | - CLI: cli/cli.md
13 | - List Models: cli/list-models.md
14 | - Training Logs: cli/training-logs.md
15 | - Squeue: cli/squeue.md
16 | - Plot: cli/plot.md
17 | - Deployment:
18 | - Laptop: deployment/laptop.md
19 | - Kubernetes: deployment/kubernetes.md
20 | - Modal: deployment/modal.md
21 | - Modal Details: deployment/modal-details.md
22 | - Docker: deployment/docker.md
23 |
--------------------------------------------------------------------------------
/docs/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Safely execute this bash script
4 | # e exit on first failure
5 | # x all executed commands are printed to the terminal
6 | # u unset variables are errors
7 | # a export all variables to the environment
8 | # E any trap on ERR is inherited by shell functions
9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 |
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 |
15 | # Clean up the old deployment directory
16 | rm -rf $LOCAL_DIRECTORY/gh-pages-deployment
17 |
18 | # Clone the repository
19 | git clone git@github.com:tensorwavecloud/scalarlm $LOCAL_DIRECTORY/gh-pages-deployment
20 |
21 | # Change to the deployment directory
22 | cd $LOCAL_DIRECTORY/gh-pages-deployment
23 |
24 | # Change to the git branch
25 | git checkout gh-pages
26 |
27 | # Copy the local files from cray-docs to the deployment directory
28 | cp $LOCAL_DIRECTORY/cray-docs/mkdocs.yml $LOCAL_DIRECTORY/gh-pages-deployment
29 | cp -r $LOCAL_DIRECTORY/cray-docs/docs $LOCAL_DIRECTORY/gh-pages-deployment/docs
30 |
31 | # Add all the files to the git repository
32 | #git add .
33 |
34 | # Commit the changes
35 | #git commit -m "Deploying the latest documentation"
36 |
37 | # Run mkdocs gh-deploy
38 | mkdocs gh-deploy
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/frontend/assets/logo.svg:
--------------------------------------------------------------------------------
1 |
33 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/aiohttp/get_global_session.py:
--------------------------------------------------------------------------------
1 | import aiohttp
2 |
3 | session = None
4 |
5 |
6 | def get_global_session():
7 | global session
8 | if session is None:
9 | session = aiohttp.ClientSession()
10 | return session
11 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/generate/finish_work.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue
2 |
3 | from cray_infra.api.fastapi.routers.request_types.finish_work_request import FinishWorkRequests
4 |
5 | import logging
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 | async def finish_work(requests : FinishWorkRequests):
10 | inference_work_queue = get_inference_work_queue()
11 |
12 | for request in requests.requests:
13 | logger.debug(f"Finishing work for request {request.request_id}")
14 |
15 | result = inference_work_queue.get_id(id=request.request_id)
16 |
17 | if request.response is not None:
18 | result["response"] = request.response
19 |
20 | if request.error is not None:
21 | result["error"] = request.error
22 |
23 | inference_work_queue.update(id=request.request_id, item=result)
24 |
25 | inference_work_queue.ack(id=request.request_id)
26 |
27 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/generate/get_results.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.request_types.get_results_request import GetResultsRequest
2 |
3 | from cray_infra.api.fastapi.generate.poll_for_responses import poll_for_responses
4 |
5 |
6 | async def get_results(request: GetResultsRequest):
7 | return await poll_for_responses(request.request_ids)
8 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/health/check_health.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.aiohttp.get_global_session import get_global_session
2 | from cray_infra.util.get_config import get_config
3 |
4 |
5 | async def check_health():
6 | vllm_health = await get_vllm_health()
7 | api_health = "up"
8 | all_health = get_all_health([vllm_health, api_health])
9 | return {"api": "up", "vllm": vllm_health, "all": all_health}
10 |
11 |
12 | def get_all_health(healths):
13 | if all(health == "up" for health in healths):
14 | return "up"
15 |
16 | if all(health == "down" for health in healths):
17 | return "down"
18 |
19 | return "mixed"
20 |
21 |
22 | async def get_vllm_health():
23 | try:
24 | session = get_global_session()
25 | config = get_config()
26 | async with session.get(config["vllm_api_url"] + "/health") as resp:
27 | assert resp.status == 200
28 | return "up"
29 | except Exception as e:
30 | return {"status": "down", "reason": str(e)}
31 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/main.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.openai_router import (
2 | openai_router,
3 | )
4 | from cray_infra.api.fastapi.routers.megatron_router import (
5 | megatron_router,
6 | )
7 | from cray_infra.api.fastapi.routers.health_router import (
8 | health_router,
9 | )
10 | from cray_infra.api.fastapi.routers.generate_router import (
11 | generate_router,
12 | )
13 |
14 | from cray_infra.api.fastapi.tasks.add_megatron_tasks import (
15 | add_megatron_tasks,
16 | )
17 |
18 | from fastapi import FastAPI, Request
19 | from fastapi.middleware.cors import CORSMiddleware
20 |
21 | import logging
22 | import os
23 |
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 |
28 | app = FastAPI(lifespan=add_megatron_tasks)
29 |
30 | app.include_router(openai_router, prefix="/v1")
31 | app.include_router(megatron_router, prefix="/v1")
32 | app.include_router(health_router, prefix="/v1")
33 | app.include_router(generate_router, prefix="/v1")
34 |
35 |
36 | origins = [
37 | "http://localhost:3000",
38 | ]
39 |
40 | app.add_middleware(
41 | CORSMiddleware,
42 | allow_origins=origins,
43 | allow_credentials=True,
44 | allow_methods=["*"],
45 | allow_headers=["*"],
46 | )
47 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/health_router.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.health.check_health import check_health
2 |
3 | from fastapi import APIRouter
4 |
5 | from fastapi.responses import JSONResponse
6 | import logging
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 | health_router = APIRouter(prefix="/health")
11 |
12 |
13 | @health_router.get("")
14 | async def health():
15 | return await check_health()
16 |
17 |
18 | @health_router.get("/keepalive")
19 | async def keepalive():
20 | return {"status": "ok"}
21 |
22 |
23 | @health_router.get("/endpoints")
24 | async def list_routes():
25 | routes = [
26 | f"Path: {route.path}, Methods: {', '.join(route.methods)}"
27 | for route in health_router.routes
28 | ]
29 | return JSONResponse(content={"endpoints": routes}, media_type="application/json")
30 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/embed_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class EmbedRequest(BaseModel):
7 | model: Optional[str] = None
8 | prompts: list[str]
9 |
10 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/finish_work_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional, Union
4 |
5 |
6 | class FinishWorkRequest(BaseModel):
7 | request_id: int
8 | response: Optional[Union[str, list[float]]] = None
9 | error: Optional[str] = None
10 |
11 |
12 | class FinishWorkRequests(BaseModel):
13 | requests: list[FinishWorkRequest]
14 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/generate_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class GenerateRequest(BaseModel):
7 | model: Optional[str] = None
8 | prompts: list[str]
9 | max_tokens: Optional[int] = 16
10 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/generate_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional, Union
4 |
5 | class Result(BaseModel):
6 | request_id: int
7 | response: Optional[Union[str, list[float]]] = None
8 | error: Optional[str] = None
9 |
10 | class GenerateResponse(BaseModel):
11 | results: list[Result]
12 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_results_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 | class GetResultsRequest(BaseModel):
6 | request_ids: list[int]
7 |
8 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_results_response.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.request_types.generate_response import GenerateResponse as GetResultsResponse
2 |
3 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_work_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 | class GetWorkRequest(BaseModel):
6 | batch_size: int
7 |
8 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_work_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class GetWorkResponse(BaseModel):
7 | prompt: str
8 | request_id: int
9 | request_type: str
10 | model: Optional[str] = None
11 | max_tokens: Optional[int] = None
12 |
13 |
14 | class GetWorkResponses(BaseModel):
15 | requests: list[GetWorkResponse]
16 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/list_models_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class ListModelsResponse(BaseModel):
5 | models: list[dict]
6 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/squeue_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class SqueueResponse(BaseModel):
7 | squeue_output : Optional[str] = None
8 | error_message : Optional[str] = None
9 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/train_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class TrainResponse(BaseModel):
7 | job_status: dict
8 | job_config: dict
9 | deployed: Optional[bool] = False
10 |
--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/tasks/add_megatron_tasks.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.get_config import get_config
2 |
3 | from cray_infra.training.restart_megatron_jobs import restart_megatron_jobs
4 | from cray_infra.training.register_megatron_models import register_megatron_models
5 | from cray_infra.generate.clear_acked_requests_from_queue import clear_acked_requests_from_queue
6 |
7 | from fastapi_utils.tasks import repeat_every
8 |
9 | from contextlib import asynccontextmanager
10 |
11 | import traceback
12 | import sys
13 | import logging
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @asynccontextmanager
19 | async def add_megatron_tasks(app):
20 | config = get_config()
21 |
22 | megatron_refresh_period = config["megatron_refresh_period"]
23 |
24 | @repeat_every(seconds=megatron_refresh_period)
25 | async def run_megatron_tasks():
26 | try:
27 | await register_megatron_models()
28 | await restart_megatron_jobs()
29 | await clear_acked_requests_from_queue()
30 | except Exception as e:
31 | print_exception()
32 | raise e
33 |
34 | await run_megatron_tasks()
35 |
36 | yield
37 |
38 |
39 | def print_exception():
40 | exc_type, exc_value, exc_traceback = sys.exc_info()
41 | messages = traceback.format_exception(exc_type, exc_value, exc_traceback)
42 |
43 | logger.error("".join(messages))
44 |
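
add_megatron_tasks follows FastAPI's lifespan protocol (an async context manager that receives the app), so it is presumably registered when the app is constructed in cray_infra.api.fastapi.main. A minimal sketch of that wiring, offered as an assumption rather than code from this repository:

from fastapi import FastAPI

from cray_infra.api.fastapi.tasks.add_megatron_tasks import add_megatron_tasks

# Hypothetical wiring: the recurring Megatron tasks run for the lifetime of the app.
app = FastAPI(lifespan=add_megatron_tasks)
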
--------------------------------------------------------------------------------
/infra/cray_infra/generate/clear_acked_requests_from_queue.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue
2 |
3 | import logging
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 | async def clear_acked_requests_from_queue():
8 | inference_work_queue = get_inference_work_queue()
9 |
10 | starting_size = len(inference_work_queue)
11 |
12 | inference_work_queue.clear_acked_data()
13 |
14 | ending_size = len(inference_work_queue)
15 |
16 | logger.info(f"Cleared {starting_size - ending_size} acked requests from the queue.")
17 |
18 |
--------------------------------------------------------------------------------
/infra/cray_infra/one_server/create_api.py:
--------------------------------------------------------------------------------
1 | import uvicorn
2 |
3 |
4 | async def create_api(port, running_status):
5 | server_config = uvicorn.Config(
6 | "cray_infra.api.fastapi.main:app",
7 | host="0.0.0.0",
8 | port=port,
9 | log_level="info",
10 | )
11 | server = uvicorn.Server(server_config)
12 | running_status.servers.append(server)
13 |
14 | await server.serve()
15 |
--------------------------------------------------------------------------------
/infra/cray_infra/one_server/create_vllm.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.get_config import get_config
2 |
3 | from vllm.entrypoints.openai.api_server import run_server
4 | from vllm.entrypoints.openai.cli_args import make_arg_parser
5 | from vllm.utils import FlexibleArgumentParser
6 |
7 | import torch
8 |
9 | import uvicorn
10 | import os
11 |
12 | import logging
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | async def create_vllm(port, running_status):
17 |
18 |     os.environ.setdefault("HUGGING_FACE_HUB_TOKEN", "")  # the token must come from the environment; never hardcode a secret
19 |
20 | config = get_config()
21 |
22 | parser = FlexibleArgumentParser(
23 | description="vLLM OpenAI-Compatible RESTful API server."
24 | )
25 | parser = make_arg_parser(parser)
26 | args = [
27 | f"--dtype={config['dtype']}",
28 | f"--max-model-len={config['max_model_length']}",
29 | f"--max-num-batched-tokens={config['max_model_length']}",
30 | f"--max-seq-len-to-capture={config['max_model_length']}",
31 | f"--gpu-memory-utilization={config['gpu_memory_utilization']}",
32 | f"--max-log-len={config['max_log_length']}",
33 | f"--swap-space=0",
34 | "--enable-lora",
35 | "--disable-async-output-proc", # Disable async output processing for embeddings
36 | ]
37 |
38 | if torch.cuda.is_available():
39 | args.append("--device=cuda")
40 |
41 | args = parser.parse_args(args=args)
42 |
43 | args.port = port
44 | args.model = config["model"]
45 |
46 | logger.info(f"Running vLLM with args: {args}")
47 |
48 | await run_server(args, running_status)
49 |
--------------------------------------------------------------------------------
/infra/cray_infra/one_server/start_cray_server.py:
--------------------------------------------------------------------------------
1 | from cray_infra.one_server.create_api import create_api
2 | from cray_infra.one_server.create_vllm import create_vllm
3 |
4 | import asyncio
5 | import logging
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
10 | async def start_cray_server(server_list: list):
11 |
12 | running_status = ServerStatus()
13 |
14 | logger.debug(f"Starting servers: {server_list}")
15 |
16 | if ("api" in server_list) or ("all" in server_list):
17 | logger.debug("Starting API server")
18 | api_task = asyncio.create_task(
19 | create_api(port=8000, running_status=running_status)
20 | )
21 | running_status.tasks.append(api_task)
22 |
23 | if ("vllm" in server_list) or ("all" in server_list):
24 | logger.debug("Starting VLLM server")
25 | vllm_task = asyncio.create_task(
26 | create_vllm(port=8001, running_status=running_status)
27 | )
28 | running_status.tasks.append(vllm_task)
29 |
30 | return running_status
31 |
32 |
33 | class ServerStatus:
34 | def __init__(self):
35 | self.servers = []
36 | self.tasks = []
37 |
38 | async def shutdown(self):
39 | for task in self.tasks:
40 | logger.debug(f"Task {task} is cancelled")
41 | task.cancel()
42 |
43 | for server in self.servers:
44 | logger.debug(f"Server {server} is cancelled")
45 | await server.shutdown()
46 |
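
For orientation, a minimal sketch of how start_cray_server might be driven from an asyncio entrypoint; the gather-then-shutdown sequence below is an illustrative assumption, not code taken from this repository:

import asyncio

from cray_infra.one_server.start_cray_server import start_cray_server


async def main():
    # Launch both the FastAPI control plane (port 8000) and the vLLM server (port 8001).
    status = await start_cray_server(server_list=["all"])
    try:
        # Wait on the server tasks; they normally run until cancelled.
        await asyncio.gather(*status.tasks)
    finally:
        await status.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
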
--------------------------------------------------------------------------------
/infra/cray_infra/one_server/wait_for_vllm.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.get_config import get_config
2 |
3 | import asyncio
4 | import aiohttp
5 |
6 | import logging
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | async def wait_for_vllm():
12 | for _ in range(30):
13 | health_status = await get_vllm_health()
14 | if health_status == 200:
15 | return
16 | await asyncio.sleep(1)
17 |
18 |
19 | async def get_vllm_health():
20 | config = get_config()
21 |
22 | try:
23 | async with aiohttp.ClientSession() as session:
24 | async with session.get(config["vllm_api_url"] + "/health") as response:
25 | return response.status
26 | except Exception as e:
27 | logger.error(f"Error getting health: {e}")
28 | return 500
29 |
--------------------------------------------------------------------------------
/infra/cray_infra/training/get_latest_model.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.get_config import get_config
2 |
3 | import os
4 | import json
5 |
6 |
7 | def get_latest_model():
8 | config = get_config()
9 |
10 | if not os.path.exists(config["training_job_directory"]):
11 | raise FileNotFoundError("No training jobs found")
12 |
13 | # Get the latest model by timestamp
14 | models = os.listdir(config["training_job_directory"])
15 |
16 | if len(models) == 0:
17 | raise FileNotFoundError("No training jobs found")
18 |
19 | models.sort(
20 | key=lambda x: get_start_time(os.path.join(config["training_job_directory"], x)),
21 | reverse=True,
22 | )
23 |
24 | model_name = models[0]
25 |
26 | return model_name
27 |
28 |
29 | def get_start_time(path):
30 | with open(os.path.join(path, "status.json")) as f:
31 | status = json.load(f)
32 |
33 | if "history" not in status:
34 | return 0
35 |
36 | return status.get("start_time", 0)
37 |
--------------------------------------------------------------------------------
/infra/cray_infra/training/metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from gpu_aware_mpi import get_rank
3 |
4 | import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 | def log_gpu_memory(prefix=""):
9 | for i in range(torch.cuda.device_count()):
10 | free, total = torch.cuda.mem_get_info(i)
11 | rank = get_rank()
12 | if rank == 0:
13 | logger.debug(f"{prefix} GPU {i}: Free={free/1e6:.2f}MB, Total={total/1e6:.2f}MB")
14 |
15 | def get_model_memory_footprint(model):
16 | param_size = 0
17 | for param in model.parameters():
18 | param_size += param.numel() * param.element_size()
19 | buffer_size = 0
20 | for buffer in model.buffers():
21 | buffer_size += buffer.numel() * buffer.element_size()
22 | total_size = param_size + buffer_size
23 | return total_size # in bytes
--------------------------------------------------------------------------------
/infra/cray_infra/training/squeue.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.request_types.squeue_response import SqueueResponse
2 |
3 | import subprocess
4 |
5 |
6 | async def squeue():
7 | try:
8 | squeue_output = subprocess.check_output(
9 | ["squeue", '--format=%.18i %.9P %.12j %.8u %.8T %.10M %.9l %.6D %R']
10 | )
11 |
12 | return SqueueResponse(
13 | squeue_output=squeue_output.decode("utf-8"),
14 | )
15 |
16 | except subprocess.CalledProcessError:
17 | return SqueueResponse(
18 | error_message="squeue command failed",
19 | )
20 |
--------------------------------------------------------------------------------
/infra/cray_infra/training/training_job_status.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class TrainingJobStatus(str, Enum):
5 | QUEUED = "QUEUED"
6 | TRAINING = "TRAINING"
7 | COMPLETED = "COMPLETED"
8 | FAILED = "FAILED"
9 |
--------------------------------------------------------------------------------
/infra/cray_infra/training/vllm_model_manager.py:
--------------------------------------------------------------------------------
1 | class VLLMModelManager:
2 | def __init__(self):
3 | self._models = []
4 |
5 | def set_registered_models(self, models):
6 | self._models = models
7 |
8 | def get_registered_models(self):
9 | return self._models
10 |
11 | def find_model(self, model_name):
12 | for model in self._models:
13 | if model_name in model:
14 | return model
15 | return None
16 |
17 |
18 | def get_vllm_model_manager():
19 | """
20 | Returns a singleton instance of VLLMModelManager.
21 | """
22 | if not hasattr(get_vllm_model_manager, "_instance"):
23 | get_vllm_model_manager._instance = VLLMModelManager()
24 | return get_vllm_model_manager._instance
25 |
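
A short usage sketch of the singleton; the model names below are hypothetical, and find_model matches by substring, so a partial name such as a job id is enough:

from cray_infra.training.vllm_model_manager import get_vllm_model_manager

manager = get_vllm_model_manager()
assert manager is get_vllm_model_manager()  # process-wide singleton

# Hypothetical registered adapter names.
manager.set_registered_models(["job-abc123/checkpoint-100", "job-def456/checkpoint-200"])

print(manager.find_model("abc123"))   # -> "job-abc123/checkpoint-100"
print(manager.find_model("missing"))  # -> None
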
--------------------------------------------------------------------------------
/infra/cray_infra/util/default_config.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Config(BaseModel):
5 | api_url: str = "http://localhost:8000"
6 |
7 | model: str = "meta-llama/llama-3.1-8b-instruct"
8 |
9 |     # 10 GB, using 1024 bytes per KB, 1024 KB per MB, and 1024 MB per GB
10 | max_upload_file_size: int = 1024 * 1024 * 1024 * 10
11 |
12 | train_job_entrypoint: str = "/app/cray/scripts/train_job_entrypoint.sh"
13 | training_job_directory: str = "/app/cray/jobs"
14 |
15 | max_train_time: int = 15 * 60
16 | extra_training_seconds: int = 300 # 5 minutes buffer before slurm kills the job
17 |
18 | slurm_wait_time: int = 30 # seconds
19 |
20 | megatron_refresh_period: int = 30 # seconds
21 |
22 | vllm_api_url: str = "http://localhost:8001"
23 |
24 | generate_batch_size: int = 1024
25 |
26 | response_timeout: int = 60 # seconds
27 | inference_work_queue_timeout: int = 30 # seconds
28 |
29 | inference_work_queue_path: str = "/app/cray/inference_work_queue.sqlite"
30 |
31 | gpu_memory_utilization: float = 0.50
32 | max_model_length: int = 8192
33 | dtype: str = "bfloat16"
34 |
35 | max_log_length: int = 100
36 |
37 | server_list: str = "all"
38 |
39 | tokenformer_r: int = 32
40 | tokenformer_num_heads: int = 4
41 |
42 | tokenformer_cache_capacity: int = 2
43 |
44 |
--------------------------------------------------------------------------------
/infra/cray_infra/util/default_job_config.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from typing import Optional
4 |
5 |
6 | class LoraConfig(BaseModel):
7 | r: int = 32
8 | target_modules: str = "all-linear"
9 | use_rslora: bool = True
10 | modules_to_save: list = ["lm_head"]
11 |
12 |
13 | class DiffusionForcingModelConfig(BaseModel):
14 | num_hidden_layers: int = 2
15 | num_diffusion_iterations: int = 3
16 | diffusion_step_size: int = 2
17 | hidden_size: int = 128
18 | num_attention_heads: int = 4
19 | attention_dropout: float = 0.1
20 |
21 |
22 | class JobConfig(BaseModel):
23 |
24 | job_directory: str
25 | training_data_path: str
26 | dataset_hash: str
27 |
28 | #llm_name: str = "masint/tiny-random-llama"
29 | llm_name: str = "meta-llama/Llama-3.2-1B-Instruct"
30 |
31 | # Training
32 | max_steps: int = 100
33 | learning_rate: float = 3e-3
34 | batch_size: int = 1
35 | gradient_clip_value: float = 1.0
36 |
37 | max_token_block_size: int = 16777216 # 16 mega tokens
38 |
39 | # Checkpointing
40 | steps_per_checkpoint: int = 100
41 | max_checkpoints_to_keep: int = 3
42 |
43 | gpus: int = 1
44 | nodes: int = 1
45 |
46 | lora_config: Optional[LoraConfig] = LoraConfig()
47 | diffusion_forcing_config: Optional[DiffusionForcingModelConfig] = (
48 | DiffusionForcingModelConfig()
49 | )
50 |
51 | # 4 hours in seconds
52 | timeout: int = 4 * 60 * 60
53 |
54 | training_history_length: int = 1024
55 |
56 |
--------------------------------------------------------------------------------
/infra/cray_infra/util/get_config.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.default_config import Config
2 |
3 | import os
4 | import yaml
5 |
6 |
7 | def get_config():
8 | loaded_config = {}
9 |
10 | config_path = "/app/cray/cray-config.yaml"
11 |
12 | if os.path.exists(config_path):
13 | with open(config_path, "r") as stream:
14 | loaded_config = yaml.safe_load(stream)
15 |
16 | return Config(**loaded_config).dict()
17 |
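
Because get_config layers the YAML file over the pydantic defaults, a partial /app/cray/cray-config.yaml is enough to override a few fields. A hedged sketch of that behavior, assuming the path is writable and using only keys defined on Config in default_config.py:

import yaml

from cray_infra.util.get_config import get_config

# Hypothetical overrides; anything not listed keeps its default value.
overrides = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "gpu_memory_utilization": 0.8,
    "server_list": "api",
}

with open("/app/cray/cray-config.yaml", "w") as stream:
    yaml.safe_dump(overrides, stream)

config = get_config()
assert config["gpu_memory_utilization"] == 0.8
assert config["megatron_refresh_period"] == 30  # untouched default
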
--------------------------------------------------------------------------------
/infra/cray_infra/util/get_job_config.py:
--------------------------------------------------------------------------------
1 | from cray_infra.util.default_job_config import JobConfig
2 |
3 | import yaml
4 | import os
5 |
6 |
7 | def get_job_config():
8 | job_config_path = get_job_config_path()
9 |
10 | with open(job_config_path, "r") as stream:
11 | job_config = yaml.safe_load(stream)
12 |
13 | # fill in missing values with defaults
14 | job_config = JobConfig(**job_config).dict()
15 |
16 | return job_config
17 |
18 |
19 | def get_job_config_path():
20 | assert (
21 | "CRAY_TRAINING_JOB_CONFIG_PATH" in os.environ
22 | ), "CRAY_TRAINING_JOB_CONFIG_PATH not set"
23 | return os.environ["CRAY_TRAINING_JOB_CONFIG_PATH"]
24 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/__init__.py:
--------------------------------------------------------------------------------
1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
2 |
3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
4 | from vllm.engine.async_llm_engine import AsyncLLMEngine
5 | from vllm.engine.llm_engine import LLMEngine
6 | from vllm.entrypoints.llm import LLM
7 | from vllm.executor.ray_utils import initialize_ray_cluster
8 | from vllm.inputs import PromptType, TextPrompt, TokensPrompt
9 | from vllm.model_executor.models import ModelRegistry
10 | from vllm.outputs import (
11 | CompletionOutput,
12 | EmbeddingOutput,
13 | EmbeddingRequestOutput,
14 | RequestOutput,
15 | )
16 | from vllm.pooling_params import PoolingParams
17 | from vllm.sampling_params import SamplingParams
18 |
19 | from .version import __version__, __version_tuple__
20 |
21 | __all__ = [
22 | "__version__",
23 | "__version_tuple__",
24 | "LLM",
25 | "ModelRegistry",
26 | "PromptType",
27 | "TextPrompt",
28 | "TokensPrompt",
29 | "SamplingParams",
30 | "RequestOutput",
31 | "CompletionOutput",
32 | "EmbeddingOutput",
33 | "EmbeddingRequestOutput",
34 | "LLMEngine",
35 | "EngineArgs",
36 | "AsyncLLMEngine",
37 | "AsyncEngineArgs",
38 | "initialize_ray_cluster",
39 | "PoolingParams",
40 | ]
41 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/_version.py:
--------------------------------------------------------------------------------
1 | # file generated by setuptools_scm
2 | # don't change, don't track in version control
3 | TYPE_CHECKING = False
4 | if TYPE_CHECKING:
5 | from typing import Tuple, Union
6 |
7 | VERSION_TUPLE = Tuple[Union[int, str], ...]
8 | else:
9 | VERSION_TUPLE = object
10 |
11 | version: str
12 | __version__: str
13 | __version_tuple__: VERSION_TUPLE
14 | version_tuple: VERSION_TUPLE
15 |
16 | __version__ = version = "0.1.dev5+g815064c.d20241108"
17 | __version_tuple__ = version_tuple = (0, 1, "dev5", "g815064c.d20241108")
18 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/adapter_commons/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/layers.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Tuple
3 |
4 |
5 | @dataclass
6 | class AdapterMapping:
7 | # Per every token in input_ids:
8 | index_mapping: Tuple[int, ...]
9 | # Per sampled token:
10 | prompt_mapping: Tuple[int, ...]
11 |
12 | def __post_init__(self):
13 | self.index_mapping = tuple(self.index_mapping)
14 | self.prompt_mapping = tuple(self.prompt_mapping)
15 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/request.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class AdapterRequest(ABC):
5 | """
6 | Base class for adapter requests.
7 | """
8 |
9 | @property
10 | @abstractmethod
11 | def adapter_id(self) -> int:
12 | raise NotImplementedError
13 |
14 | def __post_init__(self) -> None:
15 | if self.adapter_id < 1:
16 | raise ValueError(f"id must be > 0, got {self.adapter_id}")
17 |
18 | def __eq__(self, value: object) -> bool:
19 | return isinstance(value, self.__class__) and self.adapter_id == value.adapter_id
20 |
21 | def __hash__(self) -> int:
22 | return hash(self.adapter_id)
23 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/worker_manager.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any, Optional, Set
3 |
4 | import torch
5 |
6 |
7 | class AbstractWorkerManager(ABC):
8 |
9 | def __init__(self, device: torch.device):
10 | self.device = device
11 |
12 | @property
13 | @abstractmethod
14 | def is_enabled(self) -> bool:
15 | raise NotImplementedError
16 |
17 | @abstractmethod
18 | def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None:
19 | raise NotImplementedError
20 |
21 | @abstractmethod
22 | def add_adapter(self, adapter_request: Any) -> bool:
23 | raise NotImplementedError
24 |
25 | @abstractmethod
26 | def remove_adapter(self, adapter_id: int) -> bool:
27 | raise NotImplementedError
28 |
29 | @abstractmethod
30 | def remove_all_adapters(self) -> None:
31 | raise NotImplementedError
32 |
33 | @abstractmethod
34 | def list_adapters(self) -> Set[int]:
35 | raise NotImplementedError
36 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/assets/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/audio.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Literal, Tuple
3 | from urllib.parse import urljoin
4 |
5 | import librosa
6 | import numpy as np
7 |
8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
9 |
10 | ASSET_DIR = "multimodal_asset"
11 |
12 |
13 | @dataclass(frozen=True)
14 | class AudioAsset:
15 | name: Literal["winning_call", "mary_had_lamb"]
16 |
17 | @property
18 | def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
19 |
20 | audio_path = get_vllm_public_assets(
21 | filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR
22 | )
23 | y, sr = librosa.load(audio_path, sr=None)
24 | assert isinstance(sr, int)
25 | return y, sr
26 |
27 | @property
28 | def url(self) -> str:
29 | return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
30 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/base.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | import vllm.envs as envs
6 | from vllm.connections import global_http_connection
7 | from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
8 |
9 | vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
10 |
11 |
12 | def get_cache_dir() -> Path:
13 | """Get the path to the cache for storing downloaded assets."""
14 | path = Path(envs.VLLM_ASSETS_CACHE)
15 | path.mkdir(parents=True, exist_ok=True)
16 |
17 | return path
18 |
19 |
20 | @lru_cache
21 | def get_vllm_public_assets(filename: str, s3_prefix: Optional[str] = None) -> Path:
22 | """
23 | Download an asset file from ``s3://vllm-public-assets``
24 | and return the path to the downloaded file.
25 | """
26 | asset_directory = get_cache_dir() / "vllm_public_assets"
27 | asset_directory.mkdir(parents=True, exist_ok=True)
28 |
29 | asset_path = asset_directory / filename
30 | if not asset_path.exists():
31 | if s3_prefix is not None:
32 | filename = s3_prefix + "/" + filename
33 | global_http_connection.download_file(
34 | f"{vLLM_S3_BUCKET_URL}/{filename}",
35 | asset_path,
36 | timeout=VLLM_IMAGE_FETCH_TIMEOUT,
37 | )
38 |
39 | return asset_path
40 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/image.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Literal
3 |
4 | import torch
5 | from PIL import Image
6 |
7 | from vllm.assets.base import get_vllm_public_assets
8 |
9 | VLM_IMAGES_DIR = "vision_model_images"
10 |
11 |
12 | @dataclass(frozen=True)
13 | class ImageAsset:
14 | name: Literal["stop_sign", "cherry_blossom"]
15 |
16 | @property
17 | def pil_image(self) -> Image.Image:
18 |
19 | image_path = get_vllm_public_assets(
20 | filename=f"{self.name}.jpg", s3_prefix=VLM_IMAGES_DIR
21 | )
22 | return Image.open(image_path)
23 |
24 | @property
25 | def image_embeds(self) -> torch.Tensor:
26 | """
27 | Image embeddings, only used for testing purposes with llava 1.5.
28 | """
29 | image_path = get_vllm_public_assets(
30 | filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR
31 | )
32 | return torch.load(image_path)
33 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.attention.backends.abstract import (
2 | AttentionBackend,
3 | AttentionMetadata,
4 | AttentionMetadataBuilder,
5 | AttentionState,
6 | AttentionType,
7 | )
8 | from vllm.attention.layer import Attention
9 | from vllm.attention.selector import get_attn_backend
10 |
11 | __all__ = [
12 | "Attention",
13 | "AttentionBackend",
14 | "AttentionMetadata",
15 | "AttentionType",
16 | "AttentionMetadataBuilder",
17 | "Attention",
18 | "AttentionState",
19 | "get_attn_backend",
20 | ]
21 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/backends/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/compilation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/compilation/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/block/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 | from .parallel_state import *
3 | from .utils import *
4 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/communication_op.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Optional, Union
2 |
3 | import torch
4 | import torch.distributed
5 |
6 | from .parallel_state import get_tp_group
7 |
8 |
9 | def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
10 | """All-reduce the input tensor across model parallel group."""
11 | return get_tp_group().all_reduce(input_)
12 |
13 |
14 | def tensor_model_parallel_all_gather(
15 | input_: torch.Tensor, dim: int = -1
16 | ) -> torch.Tensor:
17 | """All-gather the input tensor across model parallel group."""
18 | return get_tp_group().all_gather(input_, dim)
19 |
20 |
21 | def tensor_model_parallel_gather(
22 | input_: torch.Tensor, dst: int = 0, dim: int = -1
23 | ) -> Optional[torch.Tensor]:
24 | """Gather the input tensor across model parallel group."""
25 | return get_tp_group().gather(input_, dst, dim)
26 |
27 |
28 | def broadcast_tensor_dict(
29 | tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
30 | ):
31 | if not torch.distributed.is_initialized():
32 | return tensor_dict
33 | return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
34 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/distributed/device_communicators/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/output_processor/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/output_processor/util.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from typing import Sequence as GenericSequence
3 | from typing import Union
4 |
5 | from vllm.model_executor.layers.sampler import SamplerOutput
6 | from vllm.sequence import PoolerOutput, SequenceGroupOutput
7 |
8 |
9 | def create_output_by_sequence_group(
10 | outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], num_seq_groups: int
11 | ) -> List[List[SequenceGroupOutput]]:
12 | """Helper method which transforms a 2d list organized by
13 | [step][sequence group] into [sequence group][step].
14 | """
15 | output_by_sequence_group: List[List[SequenceGroupOutput]] = [
16 | [] for _ in range(num_seq_groups)
17 | ]
18 | for step in outputs:
19 | for i, sequence_group_output in enumerate(step):
20 | output_by_sequence_group[i].append(sequence_group_output)
21 |
22 | return output_by_sequence_group
23 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/openai/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/openai/tool_parsers/__init__.py:
--------------------------------------------------------------------------------
1 | from .abstract_tool_parser import ToolParser, ToolParserManager
2 | from .hermes_tool_parser import Hermes2ProToolParser
3 | from .internlm2_tool_parser import Internlm2ToolParser
4 | from .llama_tool_parser import Llama3JsonToolParser
5 | from .mistral_tool_parser import MistralToolParser
6 |
7 | __all__ = [
8 | "ToolParser",
9 | "ToolParserManager",
10 | "Hermes2ProToolParser",
11 | "MistralToolParser",
12 | "Internlm2ToolParser",
13 | "Llama3JsonToolParser",
14 | ]
15 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/executor/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/msgspec_utils.py:
--------------------------------------------------------------------------------
1 | from array import array
2 | from typing import Any, Type
3 |
4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
5 |
6 |
7 | def encode_hook(obj: Any) -> Any:
8 | """Custom msgspec enc hook that supports array types.
9 |
10 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
11 | """
12 | if isinstance(obj, array):
13 | assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
14 | f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
15 | f"Given array has a type code of {obj.typecode}."
16 | )
17 | return obj.tobytes()
18 |
19 |
20 | def decode_hook(type: Type, obj: Any) -> Any:
21 | """Custom msgspec dec hook that supports array types.
22 |
23 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
24 | """
25 | if type is array:
26 | deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
27 | deserialized.frombytes(obj)
28 | return deserialized
29 |
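
A small round-trip sketch showing how these hooks plug into msgspec's msgpack encoder and decoder; TokenBatch is a hypothetical struct used only for illustration, and the example assumes vllm.sequence.VLLM_TOKEN_ID_ARRAY_TYPE is importable:

from array import array

import msgspec

from vllm.executor.msgspec_utils import decode_hook, encode_hook
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE


class TokenBatch(msgspec.Struct):
    # msgspec has no native support for array.array, so the hooks handle this field.
    token_ids: array


encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
decoder = msgspec.msgpack.Decoder(TokenBatch, dec_hook=decode_hook)

batch = TokenBatch(token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
restored = decoder.decode(encoder.encode(batch))
assert restored.token_ids == batch.token_ids
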
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/multiproc_xpu_executor.py:
--------------------------------------------------------------------------------
1 | import vllm.envs as envs
2 | from vllm.executor.multiproc_gpu_executor import (
3 | MultiprocessingGPUExecutor,
4 | MultiprocessingGPUExecutorAsync,
5 | )
6 | from vllm.executor.xpu_executor import XPUExecutor
7 | from vllm.logger import init_logger
8 | from vllm.utils import make_async
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
14 | """Python multiprocessing-based multi-XPU executor"""
15 |
16 | def _check_executor_parameters(self):
17 | mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
18 | if mp_method != "spawn":
19 | raise RuntimeError(
20 | "XPU multiprocess executor only support spawn as mp method"
21 | )
22 |
23 |
24 | class MultiprocessingXPUExecutorAsync(
25 | MultiprocessingXPUExecutor, MultiprocessingGPUExecutorAsync
26 | ):
27 |
28 | def __init__(self, *args, **kwargs):
29 | super().__init__(*args, **kwargs)
30 | self.driver_exec_model = make_async(self.driver_worker.execute_model)
31 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/ray_xpu_executor.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import List, Optional
3 |
4 | import vllm.envs as envs
5 | from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
6 | from vllm.executor.xpu_executor import XPUExecutor
7 | from vllm.logger import init_logger
8 | from vllm.utils import get_vllm_instance_id, make_async
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
14 |
15 | def _get_env_vars_to_be_updated(self):
16 | # Get the set of GPU IDs used on each node.
17 | worker_node_and_gpu_ids = self._run_workers(
18 | "get_node_and_gpu_ids", use_dummy_driver=True
19 | )
20 |
21 | VLLM_INSTANCE_ID = get_vllm_instance_id()
22 |
23 | # Set environment variables for the driver and workers.
24 | all_args_to_update_environment_variables = [
25 | (
26 | {
27 | "VLLM_INSTANCE_ID": VLLM_INSTANCE_ID,
28 | "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION),
29 | },
30 | )
31 | for (_, _) in worker_node_and_gpu_ids
32 | ]
33 | return all_args_to_update_environment_variables
34 |
35 |
36 | class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):
37 |
38 | def __init__(self, *args, **kwargs):
39 | super().__init__(*args, **kwargs)
40 | self.driver_exec_method = make_async(self.driver_worker.execute_method)
41 | self.pp_locks: Optional[List[asyncio.Lock]] = None
42 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/forward_context.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from typing import Any
3 |
4 | _forward_context: Any = None
5 |
6 |
7 | def get_forward_context() -> Any:
8 | """Get the current forward context."""
9 | return _forward_context
10 |
11 |
12 | @contextmanager
13 | def set_forward_context(context: Any):
14 | """A context manager that stores the current forward context,
15 | can be attention metadata, etc."""
16 | global _forward_context
17 | prev_context = _forward_context
18 | _forward_context = context
19 | try:
20 | yield
21 | finally:
22 | _forward_context = prev_context
23 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/inputs/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import (
2 | EncoderDecoderLLMInputs,
3 | ExplicitEncoderDecoderPrompt,
4 | LLMInputs,
5 | PromptType,
6 | SingletonPrompt,
7 | TextPrompt,
8 | TokensPrompt,
9 | build_explicit_enc_dec_prompt,
10 | to_enc_dec_tuple_list,
11 | zip_enc_dec_prompts,
12 | )
13 | from .registry import InputContext, InputRegistry
14 |
15 | INPUT_REGISTRY = InputRegistry()
16 | """
17 | The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
18 | to dispatch data processing according to the target model.
19 |
20 | See also:
21 | :ref:`input_processing_pipeline`
22 | """
23 |
24 | __all__ = [
25 | "TextPrompt",
26 | "TokensPrompt",
27 | "PromptType",
28 | "SingletonPrompt",
29 | "ExplicitEncoderDecoderPrompt",
30 | "LLMInputs",
31 | "EncoderDecoderLLMInputs",
32 | "build_explicit_enc_dec_prompt",
33 | "to_enc_dec_tuple_list",
34 | "zip_enc_dec_prompts",
35 | "INPUT_REGISTRY",
36 | "InputContext",
37 | "InputRegistry",
38 | ]
39 |
40 |
41 | def __getattr__(name: str):
42 | if name == "PromptInput":
43 | import warnings
44 |
45 | msg = (
46 | "PromptInput has been renamed to PromptType. "
47 | "The original name will be removed in an upcoming version."
48 | )
49 |
50 | warnings.warn(DeprecationWarning(msg), stacklevel=2)
51 |
52 | return PromptType
53 |
54 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
55 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/logging/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.logging.formatter import NewLineFormatter
2 |
3 | __all__ = [
4 | "NewLineFormatter",
5 | ]
6 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/logging/formatter.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | class NewLineFormatter(logging.Formatter):
5 | """Adds logging prefix to newlines to align multi-line messages."""
6 |
7 | def __init__(self, fmt, datefmt=None, style="%"):
8 | logging.Formatter.__init__(self, fmt, datefmt, style)
9 |
10 | def format(self, record):
11 | msg = logging.Formatter.format(self, record)
12 | if record.message != "":
13 | parts = msg.split(record.message)
14 | msg = msg.replace("\n", "\r\n" + parts[0])
15 | return msg
16 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/lora/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/ops/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter
2 | from vllm.model_executor.sampling_metadata import (
3 | SamplingMetadata,
4 | SamplingMetadataCache,
5 | )
6 | from vllm.model_executor.utils import set_random_seed
7 |
8 | __all__ = [
9 | "SamplingMetadata",
10 | "SamplingMetadataCache",
11 | "set_random_seed",
12 | "BasevLLMParameter",
13 | "PackedvLLMParameter",
14 | ]
15 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/fused_moe/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.model_executor.layers.fused_moe.layer import (
2 | FusedMoE,
3 | FusedMoEMethodBase,
4 | FusedMoeWeightScaleSupported,
5 | )
6 | from vllm.triton_utils import HAS_TRITON
7 |
8 | __all__ = [
9 | "FusedMoE",
10 | "FusedMoEMethodBase",
11 | "FusedMoeWeightScaleSupported",
12 | ]
13 |
14 | if HAS_TRITON:
15 | from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
16 | fused_marlin_moe,
17 | single_marlin_moe,
18 | )
19 | from vllm.model_executor.layers.fused_moe.fused_moe import (
20 | fused_experts,
21 | fused_moe,
22 | fused_topk,
23 | get_config_file_name,
24 | grouped_topk,
25 | )
26 |
27 | __all__ += [
28 | "fused_marlin_moe",
29 | "single_marlin_moe",
30 | "fused_moe",
31 | "fused_topk",
32 | "fused_experts",
33 | "get_config_file_name",
34 | "grouped_topk",
35 | ]
36 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
2 | For different settings of
3 | - E (number of experts)
4 | - N (intermediate size)
5 | - device_name (torch.cuda.get_device_name())
6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
7 |
8 | The example configurations provided are for the Mixtral model for TP2 on H100
9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 |
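
To make the mapping concrete, here is a hedged sketch of what one of these JSON files parses into, written as a Python dict; the M values and tuning keys below are illustrative rather than copied from a real configuration:

# Outer keys are batch sizes M; inner dicts are the kernel parameters chosen for that M.
# The exact parameter names depend on the fused_moe kernel version.
example_fused_moe_config = {
    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1},
    "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16},
}
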
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py:
--------------------------------------------------------------------------------
1 | from .compressed_tensors_scheme import CompressedTensorsScheme
2 | from .compressed_tensors_w4a16_24 import (
3 | W4A16SPARSE24_SUPPORTED_BITS,
4 | CompressedTensorsW4A16Sparse24,
5 | )
6 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
7 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
8 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
9 | from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
10 |
11 | __all__ = [
12 | "CompressedTensorsScheme",
13 | "CompressedTensorsWNA16",
14 | "CompressedTensorsW8A16Fp8",
15 | "CompressedTensorsW4A16Sparse24",
16 | "CompressedTensorsW8A8Int8",
17 | "CompressedTensorsW8A8Fp8",
18 | "WNA16_SUPPORTED_BITS",
19 | "W4A16SPARSE24_SUPPORTED_BITS",
20 | ]
21 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_utils import replace_parameter, update_tensor_inplace
2 |
3 | __all__ = ["update_tensor_inplace", "replace_parameter"]
4 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/utils/machete_utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import torch
4 |
5 | from vllm.scalar_type import ScalarType, scalar_types
6 |
7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128]
8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128]
9 |
10 |
11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]:
12 | if zero_points:
13 | return [scalar_types.uint4, scalar_types.uint8]
14 | else:
15 | return [scalar_types.uint4b8, scalar_types.uint8b128]
16 |
17 |
18 | def query_machete_supported_act_types(zero_points: bool) -> List[torch.dtype]:
19 | return [torch.float16, torch.bfloat16]
20 |
21 |
22 | def check_machete_supports_shape(
23 |     in_features: int, out_features: int
24 | ) -> Tuple[bool, Optional[str]]:
25 | if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
26 | return (
27 | False,
28 | "Input features size must be divisible by "
29 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}",
30 | )
31 |     if out_features % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
32 | return (
33 | False,
34 | "Output features size must be divisible by "
35 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}",
36 | )
37 | return True, None
38 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/model_loader/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from torch import nn
4 |
5 | from vllm.config import (
6 | CacheConfig,
7 | DeviceConfig,
8 | LoadConfig,
9 | LoRAConfig,
10 | ModelConfig,
11 | ParallelConfig,
12 | SchedulerConfig,
13 | )
14 | from vllm.model_executor.model_loader.loader import BaseModelLoader, get_model_loader
15 | from vllm.model_executor.model_loader.utils import (
16 | get_architecture_class_name,
17 | get_model_architecture,
18 | )
19 |
20 |
21 | def get_model(
22 | *,
23 | model_config: ModelConfig,
24 | load_config: LoadConfig,
25 | device_config: DeviceConfig,
26 | parallel_config: ParallelConfig,
27 | scheduler_config: SchedulerConfig,
28 | lora_config: Optional[LoRAConfig],
29 | cache_config: CacheConfig
30 | ) -> nn.Module:
31 | loader = get_model_loader(load_config)
32 | return loader.load_model(
33 | model_config=model_config,
34 | device_config=device_config,
35 | lora_config=lora_config,
36 | parallel_config=parallel_config,
37 | scheduler_config=scheduler_config,
38 | cache_config=cache_config,
39 | )
40 |
41 |
42 | __all__ = [
43 | "get_model",
44 | "get_model_loader",
45 | "BaseModelLoader",
46 | "get_architecture_class_name",
47 | "get_model_architecture",
48 | ]
49 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/model_loader/utils.py:
--------------------------------------------------------------------------------
1 | """Utilities for selecting and loading models."""
2 |
3 | import contextlib
4 | from typing import Tuple, Type
5 |
6 | import torch
7 | from torch import nn
8 |
9 | from vllm.config import ModelConfig
10 | from vllm.model_executor.models import ModelRegistry
11 |
12 |
13 | @contextlib.contextmanager
14 | def set_default_torch_dtype(dtype: torch.dtype):
15 | """Sets the default torch dtype to the given dtype."""
16 | old_dtype = torch.get_default_dtype()
17 | torch.set_default_dtype(dtype)
18 | yield
19 | torch.set_default_dtype(old_dtype)
20 |
21 |
22 | def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
23 | architectures = getattr(model_config.hf_config, "architectures", [])
24 | # Special handling for quantized Mixtral.
25 | # FIXME(woosuk): This is a temporary hack.
26 | mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"]
27 |
28 | if (
29 | model_config.quantization is not None
30 | and model_config.quantization not in mixtral_supported
31 | and "MixtralForCausalLM" in architectures
32 | ):
33 | architectures = ["QuantMixtralForCausalLM"]
34 |
35 | return ModelRegistry.resolve_model_cls(architectures)
36 |
37 |
38 | def get_architecture_class_name(model_config: ModelConfig) -> str:
39 | return get_model_architecture(model_config)[1]
40 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .interfaces import (
2 | HasInnerState,
3 | SupportsLoRA,
4 | SupportsMultiModal,
5 | SupportsPP,
6 | has_inner_state,
7 | supports_lora,
8 | supports_multimodal,
9 | supports_pp,
10 | )
11 | from .interfaces_base import (
12 | VllmModelForEmbedding,
13 | VllmModelForTextGeneration,
14 | is_embedding_model,
15 | is_text_generation_model,
16 | )
17 | from .registry import ModelRegistry
18 |
19 | __all__ = [
20 | "ModelRegistry",
21 | "VllmModelForEmbedding",
22 | "is_embedding_model",
23 | "VllmModelForTextGeneration",
24 | "is_text_generation_model",
25 | "HasInnerState",
26 | "has_inner_state",
27 | "SupportsLoRA",
28 | "supports_lora",
29 | "SupportsMultiModal",
30 | "supports_multimodal",
31 | "SupportsPP",
32 | "supports_pp",
33 | ]
34 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/models/phi3.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Adapted from llama.py
3 | """Inference-only Phi3 model code, inheriting from llama.py."""
4 |
5 | from vllm.model_executor.models.llama import LlamaForCausalLM
6 |
7 |
8 | class Phi3ForCausalLM(LlamaForCausalLM):
9 |
10 | packed_modules_mapping = {
11 | "qkv_proj": [
12 | "qkv_proj",
13 | ],
14 | "gate_up_proj": [
15 | "gate_up_proj",
16 | ],
17 | }
18 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/utils.py:
--------------------------------------------------------------------------------
1 | """Utils for model executor."""
2 |
3 | from typing import Any, Dict, Optional
4 |
5 | import torch
6 |
7 | from vllm.utils import seed_everything
8 |
9 |
10 | def set_random_seed(seed: int) -> None:
11 | seed_everything(seed)
12 |
13 |
14 | def set_weight_attrs(
15 | weight: torch.Tensor,
16 | weight_attrs: Optional[Dict[str, Any]],
17 | ):
18 | """Set attributes on a weight tensor.
19 |
20 | This method is used to set attributes on a weight tensor. This method
21 | will not overwrite existing attributes.
22 |
23 | Args:
24 | weight: The weight tensor.
25 | weight_attrs: A dictionary of attributes to set on the weight tensor.
26 | """
27 | if weight_attrs is None:
28 | return
29 | for key, value in weight_attrs.items():
30 | assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
31 | setattr(weight, key, value)
32 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import (
2 | BatchedTensorInputs,
3 | MultiModalDataBuiltins,
4 | MultiModalDataDict,
5 | MultiModalInputs,
6 | MultiModalPlugin,
7 | NestedTensors,
8 | )
9 | from .registry import MultiModalRegistry
10 |
11 | MULTIMODAL_REGISTRY = MultiModalRegistry()
12 | """
13 | The global :class:`~MultiModalRegistry` is used by model runners to
14 | dispatch data processing according to its modality and the target model.
15 |
16 | See also:
17 | :ref:`input_processing_pipeline`
18 | """
19 |
20 | __all__ = [
21 | "BatchedTensorInputs",
22 | "MultiModalDataBuiltins",
23 | "MultiModalDataDict",
24 | "MultiModalInputs",
25 | "MultiModalPlugin",
26 | "NestedTensors",
27 | "MULTIMODAL_REGISTRY",
28 | "MultiModalRegistry",
29 | ]
30 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/multimodal/audio.py:
--------------------------------------------------------------------------------
1 | from vllm.inputs.registry import InputContext
2 | from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin
3 |
4 |
5 | class AudioPlugin(MultiModalPlugin):
6 | """Plugin for audio data."""
7 |
8 | def get_data_key(self) -> str:
9 | return "audio"
10 |
11 | def _default_input_mapper(
12 | self, ctx: InputContext, data: object, **mm_processor_kwargs
13 | ) -> MultiModalInputs:
14 | raise NotImplementedError("There is no default audio input mapper")
15 |
16 | def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
17 | raise NotImplementedError("There is no default maximum multimodal tokens")
18 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/cpu.py:
--------------------------------------------------------------------------------
1 | import psutil
2 | import torch
3 |
4 | from .interface import Platform, PlatformEnum
5 |
6 |
7 | class CpuPlatform(Platform):
8 | _enum = PlatformEnum.CPU
9 |
10 | @classmethod
11 | def get_device_name(cls, device_id: int = 0) -> str:
12 | return "cpu"
13 |
14 | @classmethod
15 | def get_device_total_memory(cls, device_id: int = 0) -> int:
16 | return psutil.virtual_memory().total
17 |
18 | @classmethod
19 | def inference_mode(cls):
20 | return torch.no_grad()
21 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/rocm.py:
--------------------------------------------------------------------------------
1 | import os
2 | from functools import lru_cache
3 |
4 | import torch
5 |
6 | from vllm.logger import init_logger
7 |
8 | from .interface import DeviceCapability, Platform, PlatformEnum
9 |
10 | logger = init_logger(__name__)
11 |
12 | if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
13 | logger.warning(
14 | "`fork` method is not supported by ROCm. "
15 | "VLLM_WORKER_MULTIPROC_METHOD is overridden to"
16 | " `spawn` instead."
17 | )
18 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
19 |
20 |
21 | class RocmPlatform(Platform):
22 | _enum = PlatformEnum.ROCM
23 |
24 | @classmethod
25 | @lru_cache(maxsize=8)
26 | def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
27 | major, minor = torch.cuda.get_device_capability(device_id)
28 | return DeviceCapability(major=major, minor=minor)
29 |
30 | @classmethod
31 | @lru_cache(maxsize=8)
32 | def get_device_name(cls, device_id: int = 0) -> str:
33 | return torch.cuda.get_device_name(device_id)
34 |
35 | @classmethod
36 | def get_device_total_memory(cls, device_id: int = 0) -> int:
37 | device_props = torch.cuda.get_device_properties(device_id)
38 | return device_props.total_memory
39 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/tpu.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .interface import Platform, PlatformEnum
4 |
5 |
6 | class TpuPlatform(Platform):
7 | _enum = PlatformEnum.TPU
8 |
9 | @classmethod
10 | def get_device_name(cls, device_id: int = 0) -> str:
11 | raise NotImplementedError
12 |
13 | @classmethod
14 | def get_device_total_memory(cls, device_id: int = 0) -> int:
15 | raise NotImplementedError
16 |
17 | @classmethod
18 | def inference_mode(cls):
19 | return torch.no_grad()
20 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/xpu.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .interface import DeviceCapability, Platform, PlatformEnum
4 |
5 |
6 | class XPUPlatform(Platform):
7 | _enum = PlatformEnum.XPU
8 |
9 | @staticmethod
10 | def get_device_capability(device_id: int = 0) -> DeviceCapability:
11 | major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
12 | "."
13 | )
14 | return DeviceCapability(major=int(major), minor=int(minor))
15 |
16 | @staticmethod
17 | def get_device_name(device_id: int = 0) -> str:
18 | return torch.xpu.get_device_name(device_id)
19 |
20 | @classmethod
21 | def get_device_total_memory(cls, device_id: int = 0) -> int:
22 | device_props = torch.xpu.get_device_properties(device_id)
23 | return device_props.total_memory
24 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Callable, Optional, Union
3 |
4 | import vllm.envs as envs
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | def load_general_plugins():
10 | """WARNING: plugins can be loaded for multiple times in different
11 | processes. They should be designed in a way that they can be loaded
12 | multiple times without causing issues.
13 | """
14 | import sys
15 |
16 | if sys.version_info < (3, 10):
17 | from importlib_metadata import entry_points
18 | else:
19 | from importlib.metadata import entry_points
20 |
21 | allowed_plugins = envs.VLLM_PLUGINS
22 |
23 | discovered_plugins = entry_points(group="vllm.general_plugins")
24 | for plugin in discovered_plugins:
25 | logger.info("Found general plugin: %s", plugin.name)
26 | if allowed_plugins is None or plugin.name in allowed_plugins:
27 | try:
28 | func = plugin.load()
29 | func()
30 | logger.info("Loaded general plugin: %s", plugin.name)
31 | except Exception:
32 | logger.exception("Failed to load general plugin: %s", plugin.name)
33 |
34 |
35 | _torch_compile_backend: Optional[Union[Callable, str]] = None
36 |
37 |
38 | def set_torch_compile_backend(backend: Union[Callable, str]):
39 | global _torch_compile_backend
40 | _torch_compile_backend = backend
41 |
42 |
43 | def get_torch_compile_backend() -> Optional[Union[Callable, str]]:
44 | return _torch_compile_backend
45 |
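`load_general_plugins` discovers third-party hooks through the `vllm.general_plugins` entry-point group and calls each one, optionally filtered by the `VLLM_PLUGINS` environment variable. A package would expose such a hook roughly as below; the package, module, and function names are made up, and the idempotency guard reflects the warning in the docstring that plugins may be loaded more than once.

    # my_pkg/plugin.py -- hypothetical plugin module
    _registered = False

    def register():
        # May be called in several processes; guard so repeated loads are harmless.
        global _registered
        if _registered:
            return
        _registered = True
        # ... perform one-time registration work here ...


    # setup.py -- advertise the hook under the entry-point group that vLLM scans
    from setuptools import setup

    setup(
        name="my-vllm-plugin",
        packages=["my_pkg"],
        entry_points={"vllm.general_plugins": ["my_plugin = my_pkg.plugin:register"]},
    )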
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/pooling_params.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 |
3 | import msgspec
4 |
5 |
6 | class PoolingParams(
7 | msgspec.Struct, omit_defaults=True, array_like=True # type: ignore[call-arg]
8 | ): # type: ignore[call-arg]
9 | """Pooling parameters for pooling.
10 |
11 | Attributes:
12 | additional_data: Any additional data needed for pooling.
13 | """
14 |
15 | additional_data: Optional[Any] = None
16 |
17 | def clone(self) -> "PoolingParams":
18 | """Returns a deep copy of the PoolingParams instance."""
19 | return PoolingParams(
20 | additional_data=self.additional_data,
21 | )
22 |
23 | def __repr__(self) -> str:
24 | return f"PoolingParams(" f"additional_metadata={self.additional_data})"
25 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/prompt_adapter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/prompt_adapter/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/prompt_adapter/request.py:
--------------------------------------------------------------------------------
1 | import msgspec
2 |
3 | from vllm.adapter_commons.request import AdapterRequest
4 |
5 |
6 | class PromptAdapterRequest(
7 | msgspec.Struct,
8 | array_like=True, # type: ignore[call-arg]
9 | omit_defaults=True, # type: ignore[call-arg]
10 | frozen=True,
11 | ): # type: ignore[call-arg]
12 | """
13 | Request for a Prompt adapter.
14 | """
15 |
16 | __metaclass__ = AdapterRequest
17 |
18 | prompt_adapter_name: str
19 | prompt_adapter_id: int
20 | prompt_adapter_local_path: str
21 | prompt_adapter_num_virtual_tokens: int
22 |
23 | def __hash__(self):
24 | return super().__hash__()
25 |
26 | @property
27 | def adapter_id(self):
28 | return self.prompt_adapter_id
29 |
30 | @property
31 | def name(self):
32 | return self.prompt_adapter_name
33 |
34 | @property
35 | def local_path(self):
36 | return self.prompt_adapter_local_path
37 |
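Because `PromptAdapterRequest` is a frozen `msgspec.Struct`, instances are hashable and cheap to serialize, and the property aliases let generic adapter code treat them like any other `AdapterRequest`. Constructing one looks like this; the field values are illustrative.

    from vllm.prompt_adapter.request import PromptAdapterRequest

    req = PromptAdapterRequest(
        prompt_adapter_name="my-soft-prompt",            # illustrative name
        prompt_adapter_id=1,
        prompt_adapter_local_path="/tmp/my-soft-prompt",  # illustrative path
        prompt_adapter_num_virtual_tokens=16,
    )
    assert req.adapter_id == 1 and req.name == "my-soft-prompt"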
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/scalar_type.py:
--------------------------------------------------------------------------------
1 | from ._core_ext import NanRepr, ScalarType
2 |
3 | # naming generally follows: https://github.com/jax-ml/ml_dtypes
4 | # for floating point types (leading f) the scheme is:
5 | # `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
6 | # flags:
7 | # - no-flags: means it follows IEEE 754 conventions
8 | # - f: means finite values only (no infinities)
9 | # - n: means nans are supported (non-standard encoding)
10 | # for integer types the scheme is:
11 | # `[u]int<size_bits>[b<bias>]`
12 | # - if bias is not present it means the bias is zero
13 |
14 |
15 | class scalar_types:
16 | int4 = ScalarType.int_(4, None)
17 | uint4 = ScalarType.uint(4, None)
18 | int8 = ScalarType.int_(8, None)
19 | uint8 = ScalarType.uint(8, None)
20 | float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN.value)
21 | float8_e5m2 = ScalarType.float_IEEE754(5, 2)
22 | float16_e8m7 = ScalarType.float_IEEE754(8, 7)
23 | float16_e5m10 = ScalarType.float_IEEE754(5, 10)
24 |
25 | # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
26 | float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE.value)
27 |
28 | # "gptq" types
29 | uint4b8 = ScalarType.uint(4, 8)
30 | uint8b128 = ScalarType.uint(8, 128)
31 |
32 | # colloquial names
33 | bfloat16 = float16_e8m7
34 | float16 = float16_e5m10
35 |
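The `b<bias>` suffix in the integer names means stored codes are offset by that bias: `uint4b8` stores 4-bit codes 0..15 that represent the signed range -8..7, which is how the "gptq" types pack signed weights into unsigned nibbles. A quick worked example of the decode arithmetic, in plain Python and independent of `ScalarType`:

    # uint4b8: 4-bit unsigned storage with bias 8
    bias = 8
    stored = [0, 8, 15]                  # raw 4-bit codes
    decoded = [s - bias for s in stored]
    print(decoded)                       # [-8, 0, 7] -> the representable range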
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/spec_decode/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/tokenformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/tokenformer/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.envs import VLLM_USE_MODELSCOPE
2 |
3 | if VLLM_USE_MODELSCOPE:
4 | # Patch here, before each import happens
5 | import modelscope
6 | from packaging import version
7 |
8 | # patch_hub begins from modelscope>=1.18.1
9 | if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
10 | raise ImportError(
11 | "Using vLLM with ModelScope needs modelscope>=1.18.1, please "
12 | "install by `pip install modelscope>=1.18.1`"
13 | )
14 |
15 | from modelscope.utils.hf_util import patch_hub
16 |
17 | # Patch hub to download models from modelscope to speed up.
18 | patch_hub()
19 |
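The patch above is applied only when `VLLM_USE_MODELSCOPE` is enabled, so downloads go through the Hugging Face hub by default. Opting in is just an environment variable set before vLLM is first imported; the snippet below shows the Python form (exporting the variable in the shell works equally well), and assumes modelscope>=1.18.1 is installed.

    import os

    # Must be set before the first `import vllm` so vllm.envs picks it up.
    os.environ["VLLM_USE_MODELSCOPE"] = "True"

    import vllm  # noqa: E402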
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/configs/mllama.py:
--------------------------------------------------------------------------------
1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config
2 |
3 |
4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
5 | """
6 | Use this class to override is_encoder_decoder:
7 | - transformers regards mllama as is_encoder_decoder=False
8 | - vllm needs is_encoder_decoder=True to enable cross-attention
9 | """
10 |
11 | def __init__(
12 | self,
13 | **kwargs,
14 | ):
15 | super().__init__(**kwargs)
16 | self.is_encoder_decoder = True
17 |
18 |
19 | class MllamaConfig(mllama_hf_config.MllamaConfig):
20 |
21 | def __init__(
22 | self,
23 | text_config=None,
24 | **kwargs,
25 | ):
26 | if isinstance(text_config, dict):
27 | text_config = MllamaTextConfig(**text_config)
28 | super().__init__(text_config=text_config, **kwargs)
29 |
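The override matters because transformers marks Mllama's text config as decoder-only, while vLLM needs `is_encoder_decoder=True` to wire up cross-attention between the vision encoder and the language model. The effect is easy to check; the config values below are placeholders, and a transformers version with Mllama support is assumed.

    from vllm.transformers_utils.configs.mllama import MllamaConfig, MllamaTextConfig

    text_cfg = MllamaTextConfig()                    # all-default config, for illustration
    assert text_cfg.is_encoder_decoder is True       # forced on by the override above

    cfg = MllamaConfig(text_config={"vocab_size": 128256})  # dicts are promoted to MllamaTextConfig
    assert cfg.text_config.is_encoder_decoder is True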
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/configs/nvlm_d.py:
--------------------------------------------------------------------------------
1 | # Adapted from
2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
3 | # --------------------------------------------------------
4 | # NVLM-D
5 | # Copyright (c) 2024 NVIDIA
6 | # Licensed under Apache 2.0 License [see LICENSE for details]
7 | # --------------------------------------------------------
8 | from .internvl import InternVLChatConfig
9 |
10 |
11 | class NVLM_D_Config(InternVLChatConfig):
12 | model_type = "NVLM_D"
13 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .mistral import MistralTokenizer
2 |
3 | __all__ = ["MistralTokenizer"]
4 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/utils.py:
--------------------------------------------------------------------------------
1 | from os import PathLike
2 | from pathlib import Path
3 | from typing import Union
4 |
5 |
6 | def check_gguf_file(model: Union[str, PathLike]) -> bool:
7 | """Check if the file is a GGUF model."""
8 | model = Path(model)
9 | if not model.is_file():
10 | return False
11 | elif model.suffix == ".gguf":
12 | return True
13 |
14 | with open(model, "rb") as f:
15 | header = f.read(4)
16 | return header == b"GGUF"
17 |
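`check_gguf_file` accepts either signal: the `.gguf` extension or the 4-byte `GGUF` magic at the start of the file, so renamed checkpoints are still detected. A small usage sketch:

    import tempfile
    from pathlib import Path

    from vllm.transformers_utils.utils import check_gguf_file

    with tempfile.TemporaryDirectory() as tmp:
        p = Path(tmp) / "weights.bin"             # wrong extension on purpose
        p.write_bytes(b"GGUF" + b"\x00" * 16)     # but the 4-byte magic is present
        assert check_gguf_file(p)                 # detected via the magic bytes
        assert not check_gguf_file(p.with_name("missing.gguf"))  # non-existent -> False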
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/triton_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.triton_utils.importing import HAS_TRITON
2 |
3 | __all__ = ["HAS_TRITON"]
4 |
5 | if HAS_TRITON:
6 |
7 | from vllm.triton_utils.custom_cache_manager import maybe_set_triton_cache_manager
8 | from vllm.triton_utils.libentry import libentry
9 |
10 | __all__ += ["maybe_set_triton_cache_manager", "libentry"]
11 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/triton_utils/importing.py:
--------------------------------------------------------------------------------
1 | from importlib.util import find_spec
2 |
3 | from vllm.logger import init_logger
4 |
5 | logger = init_logger(__name__)
6 |
7 | HAS_TRITON = find_spec("triton") is not None
8 |
9 | if not HAS_TRITON:
10 | logger.info(
11 | "Triton not installed; certain GPU-related functions" " will not be available."
12 | )
13 |
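`HAS_TRITON` is the single flag the rest of the code base checks before touching Triton-backed kernels, so CPU-only installs degrade gracefully instead of failing at import time. The usual guard pattern looks like this:

    from vllm.triton_utils import HAS_TRITON

    if HAS_TRITON:
        # Only import Triton-dependent modules when the package is available.
        import triton
        print("triton", triton.__version__)
    else:
        print("Triton unavailable; using fallback (non-Triton) code paths")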
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/usage/__init__.py
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/version.py:
--------------------------------------------------------------------------------
1 | try:
2 | from ._version import __version__, __version_tuple__
3 | except Exception as e:
4 | import warnings
5 |
6 | warnings.warn(f"Failed to read commit hash:\n{e}", RuntimeWarning, stacklevel=2)
7 |
8 | __version__ = "dev"
9 | __version_tuple__ = (0, 0, __version__)
10 |
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/vllm_flash_attn/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/vllm_flash_attn/.gitkeep
--------------------------------------------------------------------------------
/infra/cray_infra/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/worker/__init__.py
--------------------------------------------------------------------------------
/infra/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 |
--------------------------------------------------------------------------------
/infra/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 |
5 | #include <stdint.h>
6 | #ifdef ENABLE_FP8
7 | #ifndef USE_ROCM
8 | #include <c10/util/Float8_e4m3fn.h>
9 | #endif // USE_ROCM
10 | #endif // ENABLE_FP8
11 |
12 | namespace vllm {
13 |
14 | enum class Fp8KVCacheDataType {
15 | kAuto = 0,
16 | kFp8E4M3 = 1,
17 | kFp8E5M2 = 2,
18 | };
19 |
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 | using Type = uint8_t;
24 | };
25 |
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 | using Type = uint16_t;
29 | };
30 |
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 | using Type = uint32_t;
34 | };
35 |
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 | using Type = uint2;
39 | };
40 |
41 | } // namespace vllm
42 |
--------------------------------------------------------------------------------
/infra/csrc/cache.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <torch/all.h>
4 |
5 | #include <map>