├── .github └── workflows │ ├── depot-amd.yml │ ├── depot-cpu.yml │ ├── depot-nvidia-8.0.yml │ ├── depot-nvidia-8.6.yml │ ├── depot-nvidia.yml │ └── unit-tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── Faq.md ├── LICENSE ├── README.md ├── cmd ├── bashly-settings.yml ├── bashly.sh ├── bashly.yml ├── benchmark_command.sh ├── build_image_command.sh ├── depot_build_command.sh ├── lib │ └── colors.sh ├── llm_logs_command.sh ├── llm_ls_command.sh ├── llm_plot_command.sh ├── llm_squeue_command.sh ├── pypi_command.sh ├── test_command.sh └── up_command.sh ├── deployment ├── ansible │ ├── hosts │ └── k8.yaml └── helm │ ├── amd_multi_node │ └── scalarlm │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── api_configmap.yaml │ │ ├── api_deployment.yaml │ │ ├── api_service.yaml │ │ ├── vllm_configmap.yaml │ │ ├── vllm_deployment.yaml │ │ └── vllm_service.yaml │ │ └── values.yaml │ ├── amd_single_node │ └── scalarlm │ │ ├── Chart.yaml │ │ ├── local-hostpath-sc.yaml │ │ ├── storageclass-clusterrole.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── api_configmap.yaml │ │ ├── api_deployment.yaml │ │ ├── api_service.yaml │ │ ├── cache_pvc.yaml │ │ ├── jobs_pvc.yaml │ │ ├── vllm_configmap.yaml │ │ ├── vllm_deployment.yaml │ │ └── vllm_service.yaml │ │ └── values.yaml │ ├── amd_single_pod │ └── scalarlm │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── configmap.yaml │ │ ├── deployment.yaml │ │ └── service.yaml │ │ └── values.yaml │ ├── cray │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── deployment.yaml │ │ ├── hpa.yaml │ │ ├── ingress.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── tests │ │ │ └── test-connection.yaml │ └── values.yaml │ ├── lambda │ └── cray │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── configmap.yaml │ │ ├── deployment.yaml │ │ └── service.yaml │ │ └── values.yaml │ ├── minikube │ └── cray │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── deployment.yaml │ │ └── service.yaml │ │ └── values.yaml │ ├── tensorwave3b │ └── scalarlm │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── api_configmap.yaml │ │ ├── api_deployment.yaml │ │ ├── api_service.yaml │ │ ├── cache_pvc.yaml │ │ ├── jobs_pvc.yaml │ │ ├── vllm_configmap.yaml │ │ ├── vllm_deployment.yaml │ │ └── vllm_service.yaml │ │ └── values.yaml │ ├── tensorwave70b │ └── scalarlm │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── api_configmap.yaml │ │ ├── api_deployment.yaml │ │ ├── api_service.yaml │ │ ├── cache_pvc.yaml │ │ ├── jobs_pvc.yaml │ │ ├── vllm_configmap.yaml │ │ ├── vllm_deployment.yaml │ │ └── vllm_service.yaml │ │ └── values.yaml │ └── tensorwave8b │ └── scalarlm │ ├── Chart.yaml │ ├── templates │ ├── _helpers.tpl │ ├── api_configmap.yaml │ ├── api_deployment.yaml │ ├── api_service.yaml │ ├── cache_pvc.yaml │ ├── jobs_pvc.yaml │ ├── vllm_configmap.yaml │ ├── vllm_deployment.yaml │ └── vllm_service.yaml │ └── values.yaml ├── docker-compose.yaml ├── docs ├── cray-docs │ ├── docs │ │ ├── arch.md │ │ ├── assets │ │ │ ├── cray-arch.png │ │ │ ├── cray.jpeg │ │ │ └── loss_plot_044db4ac60.png │ │ ├── cli │ │ │ ├── cli.md │ │ │ ├── list-models.md │ │ │ ├── plot.md │ │ │ ├── squeue.md │ │ │ └── training-logs.md │ │ ├── contact.md │ │ ├── deployment │ │ │ ├── docker.md │ │ │ ├── kubernetes.md │ │ │ ├── laptop.md │ │ │ ├── modal-details.md │ │ │ └── modal.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── quickstart.md │ │ └── training.md │ └── mkdocs.yml 
└── deploy.sh ├── frontend └── assets │ └── logo.svg ├── infra ├── CMakeLists.txt ├── cmake │ ├── cpu_extension.cmake │ ├── hipify.py │ └── utils.cmake ├── cray_infra │ ├── api │ │ ├── fastapi │ │ │ ├── aiohttp │ │ │ │ └── get_global_session.py │ │ │ ├── generate │ │ │ │ ├── embed.py │ │ │ │ ├── finish_work.py │ │ │ │ ├── generate.py │ │ │ │ ├── get_results.py │ │ │ │ ├── get_work.py │ │ │ │ └── poll_for_responses.py │ │ │ ├── health │ │ │ │ └── check_health.py │ │ │ ├── main.py │ │ │ ├── routers │ │ │ │ ├── generate_router.py │ │ │ │ ├── health_router.py │ │ │ │ ├── megatron_router.py │ │ │ │ ├── openai_router.py │ │ │ │ └── request_types │ │ │ │ │ ├── embed_request.py │ │ │ │ │ ├── finish_work_request.py │ │ │ │ │ ├── generate_request.py │ │ │ │ │ ├── generate_response.py │ │ │ │ │ ├── get_results_request.py │ │ │ │ │ ├── get_results_response.py │ │ │ │ │ ├── get_work_request.py │ │ │ │ │ ├── get_work_response.py │ │ │ │ │ ├── list_models_response.py │ │ │ │ │ ├── squeue_response.py │ │ │ │ │ └── train_request.py │ │ │ └── tasks │ │ │ │ └── add_megatron_tasks.py │ │ └── work_queue │ │ │ └── inference_work_queue.py │ ├── generate │ │ └── clear_acked_requests_from_queue.py │ ├── one_server │ │ ├── create_api.py │ │ ├── create_vllm.py │ │ ├── main.py │ │ ├── start_cray_server.py │ │ └── wait_for_vllm.py │ ├── slurm │ │ └── discovery │ │ │ └── discover_clusters.py │ ├── training │ │ ├── distribution_strategy │ │ │ └── fsdp │ │ │ │ └── fsdp.py │ │ ├── get_latest_model.py │ │ ├── get_training_job_info.py │ │ ├── gpu_aware_mpi │ │ │ ├── gpu_aware_mpi.cpp │ │ │ └── setup.py │ │ ├── launch_training_job.py │ │ ├── list_models.py │ │ ├── metrics.py │ │ ├── print_logo.py │ │ ├── register_megatron_models.py │ │ ├── restart_megatron_jobs.py │ │ ├── squeue.py │ │ ├── training_harness.py │ │ ├── training_job_status.py │ │ ├── training_logs_generator.py │ │ ├── upload_training_data.py │ │ └── vllm_model_manager.py │ ├── util │ │ ├── default_config.py │ │ ├── default_job_config.py │ │ ├── get_config.py │ │ └── get_job_config.py │ └── vllm │ │ ├── __init__.py │ │ ├── _core_ext.py │ │ ├── _custom_ops.py │ │ ├── _ipex_ops.py │ │ ├── _version.py │ │ ├── adapter_commons │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── models.py │ │ ├── request.py │ │ ├── utils.py │ │ └── worker_manager.py │ │ ├── assets │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── base.py │ │ ├── image.py │ │ └── video.py │ │ ├── attention │ │ ├── __init__.py │ │ ├── backends │ │ │ ├── __init__.py │ │ │ ├── abstract.py │ │ │ ├── blocksparse_attn.py │ │ │ ├── flash_attn.py │ │ │ ├── flashinfer.py │ │ │ ├── ipex_attn.py │ │ │ ├── openvino.py │ │ │ ├── pallas.py │ │ │ ├── rocm_flash_attn.py │ │ │ ├── torch_sdpa.py │ │ │ ├── utils.py │ │ │ └── xformers.py │ │ ├── layer.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── blocksparse_attention │ │ │ │ ├── __init__.py │ │ │ │ ├── blocksparse_attention_kernel.py │ │ │ │ ├── interface.py │ │ │ │ └── utils.py │ │ │ ├── ipex_attn.py │ │ │ ├── paged_attn.py │ │ │ ├── prefix_prefill.py │ │ │ └── triton_flash_attention.py │ │ └── selector.py │ │ ├── beam_search.py │ │ ├── block.py │ │ ├── compilation │ │ ├── __init__.py │ │ ├── backends.py │ │ └── wrapper.py │ │ ├── config.py │ │ ├── connections.py │ │ ├── core │ │ ├── __init__.py │ │ ├── block │ │ │ ├── __init__.py │ │ │ ├── block_table.py │ │ │ ├── common.py │ │ │ ├── cpu_gpu_block_allocator.py │ │ │ ├── interfaces.py │ │ │ ├── naive_block.py │ │ │ ├── prefix_caching_block.py │ │ │ └── utils.py │ │ ├── block_manager_v1.py │ │ ├── block_manager_v2.py │ │ ├── 
embedding_model_block_manager.py │ │ ├── evictor_v1.py │ │ ├── evictor_v2.py │ │ ├── interfaces.py │ │ └── scheduler.py │ │ ├── distributed │ │ ├── __init__.py │ │ ├── communication_op.py │ │ ├── device_communicators │ │ │ ├── __init__.py │ │ │ ├── cuda_wrapper.py │ │ │ ├── custom_all_reduce.py │ │ │ ├── custom_all_reduce_utils.py │ │ │ ├── pynccl.py │ │ │ ├── pynccl_wrapper.py │ │ │ ├── shm_broadcast.py │ │ │ └── tpu_communicator.py │ │ ├── parallel_state.py │ │ └── utils.py │ │ ├── engine │ │ ├── __init__.py │ │ ├── arg_utils.py │ │ ├── async_llm_engine.py │ │ ├── async_timeout.py │ │ ├── llm_engine.py │ │ ├── metrics.py │ │ ├── metrics_types.py │ │ ├── multiprocessing │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ └── engine.py │ │ ├── output_processor │ │ │ ├── __init__.py │ │ │ ├── interfaces.py │ │ │ ├── multi_step.py │ │ │ ├── single_step.py │ │ │ ├── stop_checker.py │ │ │ └── util.py │ │ └── protocol.py │ │ ├── entrypoints │ │ ├── __init__.py │ │ ├── api_server.py │ │ ├── chat_utils.py │ │ ├── launcher.py │ │ ├── llm.py │ │ ├── logger.py │ │ └── openai │ │ │ ├── __init__.py │ │ │ ├── api_server.py │ │ │ ├── cli_args.py │ │ │ ├── logits_processors.py │ │ │ ├── protocol.py │ │ │ ├── run_batch.py │ │ │ ├── serving_chat.py │ │ │ ├── serving_completion.py │ │ │ ├── serving_embedding.py │ │ │ ├── serving_engine.py │ │ │ ├── serving_tokenization.py │ │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ ├── abstract_tool_parser.py │ │ │ ├── hermes_tool_parser.py │ │ │ ├── internlm2_tool_parser.py │ │ │ ├── llama_tool_parser.py │ │ │ ├── mistral_tool_parser.py │ │ │ └── utils.py │ │ ├── envs.py │ │ ├── executor │ │ ├── __init__.py │ │ ├── cpu_executor.py │ │ ├── distributed_gpu_executor.py │ │ ├── executor_base.py │ │ ├── gpu_executor.py │ │ ├── msgspec_utils.py │ │ ├── multiproc_gpu_executor.py │ │ ├── multiproc_worker_utils.py │ │ ├── multiproc_xpu_executor.py │ │ ├── neuron_executor.py │ │ ├── openvino_executor.py │ │ ├── ray_gpu_executor.py │ │ ├── ray_tpu_executor.py │ │ ├── ray_utils.py │ │ ├── ray_xpu_executor.py │ │ ├── tpu_executor.py │ │ └── xpu_executor.py │ │ ├── forward_context.py │ │ ├── inputs │ │ ├── __init__.py │ │ ├── data.py │ │ ├── parse.py │ │ ├── preprocess.py │ │ └── registry.py │ │ ├── logger.py │ │ ├── logging │ │ ├── __init__.py │ │ └── formatter.py │ │ ├── lora │ │ ├── __init__.py │ │ ├── fully_sharded_layers.py │ │ ├── layers.py │ │ ├── lora.py │ │ ├── models.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── bgmv_expand.py │ │ │ ├── bgmv_expand_slice.py │ │ │ ├── bgmv_shrink.py │ │ │ ├── sgmv_expand.py │ │ │ ├── sgmv_expand_slice.py │ │ │ ├── sgmv_shrink.py │ │ │ └── utils.py │ │ ├── punica.py │ │ ├── request.py │ │ ├── utils.py │ │ └── worker_manager.py │ │ ├── model_executor │ │ ├── __init__.py │ │ ├── custom_op.py │ │ ├── guided_decoding │ │ │ ├── __init__.py │ │ │ ├── guided_fields.py │ │ │ ├── lm_format_enforcer_decoding.py │ │ │ ├── outlines_decoding.py │ │ │ └── outlines_logits_processors.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── fused_moe │ │ │ │ ├── __init__.py │ │ │ │ ├── configs │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ 
│ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ │ │ └── README │ │ │ │ ├── fused_marlin_moe.py 
│ │ │ │ ├── fused_moe.py │ │ │ │ ├── layer.py │ │ │ │ └── moe_pallas.py │ │ │ ├── layernorm.py │ │ │ ├── linear.py │ │ │ ├── logits_processor.py │ │ │ ├── mamba │ │ │ │ ├── __init__.py │ │ │ │ └── ops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── causal_conv1d.py │ │ │ │ │ └── mamba_ssm.py │ │ │ ├── pooler.py │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── aqlm.py │ │ │ │ ├── awq.py │ │ │ │ ├── awq_marlin.py │ │ │ │ ├── awq_triton.py │ │ │ │ ├── base_config.py │ │ │ │ ├── bitsandbytes.py │ │ │ │ ├── compressed_tensors │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── compressed_tensors.py │ │ │ │ │ ├── compressed_tensors_moe.py │ │ │ │ │ ├── schemes │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ │ │ └── utils.py │ │ │ │ ├── deepspeedfp.py │ │ │ │ ├── experts_int8.py │ │ │ │ ├── fbgemm_fp8.py │ │ │ │ ├── fp8.py │ │ │ │ ├── gguf.py │ │ │ │ ├── gptq.py │ │ │ │ ├── gptq_marlin.py │ │ │ │ ├── gptq_marlin_24.py │ │ │ │ ├── ipex_quant.py │ │ │ │ ├── kernels │ │ │ │ │ ├── MPLinearKernel.py │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── machete.py │ │ │ │ │ └── marlin.py │ │ │ │ ├── kv_cache.py │ │ │ │ ├── marlin.py │ │ │ │ ├── modelopt.py │ │ │ │ ├── neuron_quant.py │ │ │ │ ├── qqq.py │ │ │ │ ├── schema.py │ │ │ │ ├── tpu_int8.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── layer_utils.py │ │ │ │ │ ├── machete_utils.py │ │ │ │ │ ├── marlin_utils.py │ │ │ │ │ ├── marlin_utils_fp8.py │ │ │ │ │ ├── marlin_utils_test.py │ │ │ │ │ ├── marlin_utils_test_24.py │ │ │ │ │ ├── marlin_utils_test_qqq.py │ │ │ │ │ ├── quant_utils.py │ │ │ │ │ └── w8a8_utils.py │ │ │ ├── rejection_sampler.py │ │ │ ├── resampler.py │ │ │ ├── rotary_embedding.py │ │ │ ├── sampler.py │ │ │ ├── spec_decode_base_sampler.py │ │ │ ├── typical_acceptance_sampler.py │ │ │ └── vocab_parallel_embedding.py │ │ ├── model_loader │ │ │ ├── __init__.py │ │ │ ├── loader.py │ │ │ ├── neuron.py │ │ │ ├── openvino.py │ │ │ ├── tensorizer.py │ │ │ ├── utils.py │ │ │ └── weight_utils.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── arctic.py │ │ │ ├── baichuan.py │ │ │ ├── bart.py │ │ │ ├── blip.py │ │ │ ├── blip2.py │ │ │ ├── bloom.py │ │ │ ├── chameleon.py │ │ │ ├── chatglm.py │ │ │ ├── clip.py │ │ │ ├── commandr.py │ │ │ ├── dbrx.py │ │ │ ├── decilm.py │ │ │ ├── deepseek.py │ │ │ ├── deepseek_v2.py │ │ │ ├── eagle.py │ │ │ ├── exaone.py │ │ │ ├── falcon.py │ │ │ ├── fuyu.py │ │ │ ├── gemma.py │ │ │ ├── gemma2.py │ │ │ ├── gemma2_embedding.py │ │ │ ├── gpt2.py │ │ │ ├── gpt_bigcode.py │ │ │ ├── gpt_j.py │ │ │ ├── gpt_neox.py │ │ │ ├── granite.py │ │ │ ├── granitemoe.py │ │ │ ├── idefics2_vision_model.py │ │ │ ├── interfaces.py │ │ │ ├── interfaces_base.py │ │ │ ├── intern_vit.py │ │ │ ├── internlm2.py │ │ │ ├── internvl.py │ │ │ ├── jais.py │ │ │ ├── jamba.py │ │ │ ├── llama.py │ │ │ ├── llama_embedding.py │ │ │ ├── llava.py │ │ │ ├── llava_next.py │ │ │ ├── llava_next_video.py │ │ │ ├── llava_onevision.py │ │ │ ├── medusa.py │ │ │ ├── minicpm.py │ │ │ ├── minicpm3.py │ │ │ ├── minicpmv.py │ │ │ ├── mixtral.py │ │ │ ├── mixtral_quant.py │ │ │ ├── mllama.py │ │ │ ├── mlp_speculator.py │ │ │ ├── module_mapping.py │ │ │ ├── mpt.py │ │ │ ├── nemotron.py │ │ │ ├── nvlm_d.py │ │ │ ├── olmo.py │ │ │ ├── olmoe.py │ │ │ ├── opt.py │ │ │ ├── orion.py │ │ │ ├── paligemma.py │ │ │ ├── 
persimmon.py │ │ │ ├── phi.py │ │ │ ├── phi3.py │ │ │ ├── phi3_small.py │ │ │ ├── phi3v.py │ │ │ ├── phimoe.py │ │ │ ├── pixtral.py │ │ │ ├── qwen.py │ │ │ ├── qwen2.py │ │ │ ├── qwen2_moe.py │ │ │ ├── qwen2_rm.py │ │ │ ├── qwen2_vl.py │ │ │ ├── registry.py │ │ │ ├── siglip.py │ │ │ ├── solar.py │ │ │ ├── stablelm.py │ │ │ ├── starcoder2.py │ │ │ ├── ultravox.py │ │ │ ├── utils.py │ │ │ └── xverse.py │ │ ├── parameter.py │ │ ├── pooling_metadata.py │ │ ├── sampling_metadata.py │ │ └── utils.py │ │ ├── multimodal │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── base.py │ │ ├── image.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── video.py │ │ ├── outputs.py │ │ ├── platforms │ │ ├── __init__.py │ │ ├── cpu.py │ │ ├── cuda.py │ │ ├── interface.py │ │ ├── rocm.py │ │ ├── tpu.py │ │ └── xpu.py │ │ ├── plugins │ │ └── __init__.py │ │ ├── pooling_params.py │ │ ├── prompt_adapter │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── models.py │ │ ├── request.py │ │ ├── utils.py │ │ └── worker_manager.py │ │ ├── py.typed │ │ ├── sampling_params.py │ │ ├── scalar_type.py │ │ ├── scripts.py │ │ ├── sequence.py │ │ ├── spec_decode │ │ ├── __init__.py │ │ ├── batch_expansion.py │ │ ├── draft_model_runner.py │ │ ├── interfaces.py │ │ ├── medusa_worker.py │ │ ├── metrics.py │ │ ├── mlp_speculator_worker.py │ │ ├── mqa_scorer.py │ │ ├── multi_step_worker.py │ │ ├── ngram_worker.py │ │ ├── proposer_worker_base.py │ │ ├── smaller_tp_proposer_worker.py │ │ ├── spec_decode_worker.py │ │ ├── target_model_runner.py │ │ ├── top1_proposer.py │ │ └── util.py │ │ ├── tokenformer │ │ ├── __init__.py │ │ └── tokenformer_model_manager.py │ │ ├── tracing.py │ │ ├── transformers_utils │ │ ├── __init__.py │ │ ├── config.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── arctic.py │ │ │ ├── chatglm.py │ │ │ ├── dbrx.py │ │ │ ├── eagle.py │ │ │ ├── exaone.py │ │ │ ├── falcon.py │ │ │ ├── internvl.py │ │ │ ├── jais.py │ │ │ ├── medusa.py │ │ │ ├── mllama.py │ │ │ ├── mlp_speculator.py │ │ │ ├── mpt.py │ │ │ ├── nemotron.py │ │ │ ├── nvlm_d.py │ │ │ ├── qwen2vl.py │ │ │ ├── solar.py │ │ │ └── ultravox.py │ │ ├── detokenizer.py │ │ ├── processor.py │ │ ├── tokenizer.py │ │ ├── tokenizer_group │ │ │ ├── __init__.py │ │ │ ├── base_tokenizer_group.py │ │ │ ├── ray_tokenizer_group.py │ │ │ └── tokenizer_group.py │ │ ├── tokenizers │ │ │ ├── __init__.py │ │ │ └── mistral.py │ │ └── utils.py │ │ ├── triton_utils │ │ ├── __init__.py │ │ ├── custom_cache_manager.py │ │ ├── importing.py │ │ └── libentry.py │ │ ├── usage │ │ ├── __init__.py │ │ └── usage_lib.py │ │ ├── utils.py │ │ ├── version.py │ │ ├── vllm_flash_attn │ │ └── .gitkeep │ │ └── worker │ │ ├── __init__.py │ │ ├── cache_engine.py │ │ ├── cpu_enc_dec_model_runner.py │ │ ├── cpu_model_runner.py │ │ ├── cpu_worker.py │ │ ├── embedding_model_runner.py │ │ ├── enc_dec_model_runner.py │ │ ├── model_runner.py │ │ ├── model_runner_base.py │ │ ├── multi_step_model_runner.py │ │ ├── multi_step_tpu_worker.py │ │ ├── multi_step_worker.py │ │ ├── neuron_model_runner.py │ │ ├── neuron_worker.py │ │ ├── openvino_model_runner.py │ │ ├── openvino_worker.py │ │ ├── tpu_model_runner.py │ │ ├── tpu_worker.py │ │ ├── utils.py │ │ ├── worker.py │ │ ├── worker_base.py │ │ ├── xpu_model_runner.py │ │ └── xpu_worker.py ├── csrc │ ├── activation_kernels.cu │ ├── attention │ │ ├── attention_dtypes.h │ │ ├── attention_generic.cuh │ │ ├── attention_kernels.cu │ │ ├── attention_utils.cuh │ │ ├── dtype_bfloat16.cuh │ │ ├── dtype_float16.cuh │ │ ├── dtype_float32.cuh │ │ └── dtype_fp8.cuh │ ├── cache.h │ ├── 
cache_kernels.cu │ ├── core │ │ ├── exception.hpp │ │ ├── registration.h │ │ ├── scalar_type.hpp │ │ └── torch_bindings.cpp │ ├── cpu │ │ ├── activation.cpp │ │ ├── attention.cpp │ │ ├── cache.cpp │ │ ├── cpu_types.hpp │ │ ├── cpu_types_arm.hpp │ │ ├── cpu_types_vsx.hpp │ │ ├── cpu_types_x86.hpp │ │ ├── dnnl_helper.hpp │ │ ├── layernorm.cpp │ │ ├── pos_encoding.cpp │ │ ├── quant.cpp │ │ ├── torch_bindings.cpp │ │ └── utils.cpp │ ├── cuda_compat.h │ ├── cuda_utils.h │ ├── cuda_utils_kernels.cu │ ├── custom_all_reduce.cu │ ├── custom_all_reduce.cuh │ ├── custom_all_reduce_test.cu │ ├── cutlass_extensions │ │ ├── cute_utils.cuh │ │ ├── torch_utils.hpp │ │ ├── vllm_collective_builder.cuh │ │ ├── vllm_custom_types.cuh │ │ ├── vllm_cutlass_library_extension.py │ │ └── vllm_numeric_conversion.cuh │ ├── dispatch_utils.h │ ├── layernorm_kernels.cu │ ├── mamba │ │ ├── causal_conv1d │ │ │ ├── causal_conv1d.cu │ │ │ ├── causal_conv1d.h │ │ │ └── static_switch.h │ │ └── mamba_ssm │ │ │ ├── selective_scan.h │ │ │ ├── selective_scan_fwd.cu │ │ │ └── static_switch.h │ ├── moe │ │ ├── marlin_kernels │ │ │ ├── marlin_moe_kernel.h │ │ │ ├── marlin_moe_kernel_ku4.cu │ │ │ ├── marlin_moe_kernel_ku4.h │ │ │ ├── marlin_moe_kernel_ku4b8.cu │ │ │ ├── marlin_moe_kernel_ku4b8.h │ │ │ ├── marlin_moe_kernel_ku8b128.cu │ │ │ └── marlin_moe_kernel_ku8b128.h │ │ ├── marlin_moe_ops.cu │ │ ├── moe_ops.h │ │ ├── topk_softmax_kernels.cu │ │ └── torch_bindings.cpp │ ├── moe_align_block_size_kernels.cu │ ├── ops.h │ ├── permute_cols.cu │ ├── pos_encoding_kernels.cu │ ├── prepare_inputs │ │ ├── advance_step.cu │ │ └── advance_step.cuh │ ├── quantization │ │ ├── aqlm │ │ │ └── gemm_kernels.cu │ │ ├── awq │ │ │ ├── dequantize.cuh │ │ │ └── gemm_kernels.cu │ │ ├── compressed_tensors │ │ │ └── int8_quant_kernels.cu │ │ ├── cutlass_w8a8 │ │ │ ├── Epilogues.md │ │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ │ ├── common.hpp │ │ │ ├── scaled_mm_c2x.cu │ │ │ ├── scaled_mm_c2x.cuh │ │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ │ ├── scaled_mm_c3x.cu │ │ │ └── scaled_mm_entry.cu │ │ ├── fp8 │ │ │ ├── amd │ │ │ │ ├── hip_float8.h │ │ │ │ ├── hip_float8_impl.h │ │ │ │ └── quant_utils.cuh │ │ │ ├── common.cu │ │ │ ├── fp8_marlin.cu │ │ │ └── nvidia │ │ │ │ └── quant_utils.cuh │ │ ├── gguf │ │ │ ├── dequantize.cuh │ │ │ ├── ggml-common.h │ │ │ ├── gguf_kernel.cu │ │ │ ├── mmq.cuh │ │ │ ├── mmvq.cuh │ │ │ └── vecdotq.cuh │ │ ├── gptq │ │ │ ├── compat.cuh │ │ │ ├── matrix_view.cuh │ │ │ ├── q_gemm.cu │ │ │ ├── qdq_2.cuh │ │ │ ├── qdq_3.cuh │ │ │ ├── qdq_4.cuh │ │ │ ├── qdq_8.cuh │ │ │ └── qdq_util.cuh │ │ ├── gptq_marlin │ │ │ ├── awq_marlin_repack.cu │ │ │ ├── gptq_marlin.cu │ │ │ ├── gptq_marlin_repack.cu │ │ │ ├── marlin.cuh │ │ │ └── marlin_dtypes.cuh │ │ ├── machete │ │ │ ├── Readme.md │ │ │ ├── generate.py │ │ │ ├── machete_collective_builder.cuh │ │ │ ├── machete_interleaving_utils.cuh │ │ │ ├── machete_mainloop.cuh │ │ │ ├── machete_mm_kernel.cuh │ │ │ ├── machete_mm_launcher.cuh │ │ │ ├── machete_prepack_kernel.cuh │ │ │ ├── machete_prepack_launcher.cuh │ │ │ ├── machete_prepacked_layout.cuh │ │ │ └── machete_pytorch.cu │ │ └── marlin │ │ │ ├── dense │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ │ ├── base.h │ │ │ │ └── mem.h │ │ │ └── marlin_cuda_kernel.cu │ │ │ ├── qqq │ │ │ └── marlin_qqq_gemm_kernel.cu │ │ │ └── sparse │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ ├── 
base.h │ │ │ ├── mem.h │ │ │ └── mma.h │ │ │ └── marlin_24_cuda_kernel.cu │ ├── rocm │ │ ├── attention.cu │ │ ├── ops.h │ │ └── torch_bindings.cpp │ └── torch_bindings.cpp ├── requirements-vllm-build.txt ├── requirements-vllm.txt ├── setup.py ├── slurm_configs │ ├── cgroup.conf │ ├── gres.conf │ ├── munge.key │ ├── slurm.conf │ └── slurm.key ├── slurm_src │ ├── cgroup_docker.c │ └── compile.sh └── util │ └── plot_training.py ├── ml ├── cray_megatron │ ├── collectives │ │ ├── data_parallelism.py │ │ └── main_rank_only.py │ ├── huggingface │ │ └── download_model.py │ ├── main.py │ ├── megatron │ │ ├── dataset │ │ │ ├── data_loader.py │ │ │ └── load_dataset.py │ │ ├── distribution │ │ │ └── apply_distribution_strategy.py │ │ ├── megatron_trainer.py │ │ └── training_loop.py │ └── models │ │ ├── does_any_checkpoint_exist.py │ │ ├── get_latest_checkpoint_path.py │ │ ├── get_model_manager.py │ │ ├── model_manager_base.py │ │ └── tokenformer │ │ ├── load_tokenformer_model.py │ │ └── tokenformer_model_manager.py └── tokenformer │ ├── llama_tokenformer_layers.py │ ├── llama_tokenformer_model.py │ ├── tokenformer_surgeon.py │ └── transformers_tokenformer.py ├── requirements.txt ├── scalarlm ├── scripts ├── cray ├── start_one_server.sh ├── start_slurm.sh └── train_job_entrypoint.sh ├── sdk ├── masint │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── async_supermassive_intelligence.py │ │ └── supermassive_intelligence.py │ ├── cli │ │ ├── __init__.py │ │ ├── ls.py │ │ ├── main.py │ │ ├── plot.py │ │ ├── squeue.py │ │ └── view_logs.py │ ├── engines │ │ ├── __init__.py │ │ ├── async_cray.py │ │ └── cray │ │ │ ├── __init__.py │ │ │ └── submit_training_job.py │ └── util │ │ ├── __init__.py │ │ ├── get_api_base.py │ │ └── make_api_url.py ├── pyproject.toml └── scalarlm │ └── __init__.py └── test ├── benchmark ├── main.py ├── pytorch │ ├── backward.py │ ├── forward.py │ ├── gemm.py │ ├── memcpy.py │ ├── memcpy_peer.py │ └── mpi_p2p.py └── roofline │ ├── plot_bandwidth_sweep.py │ └── plot_roofline.py ├── deployment ├── embed.py ├── generate.py ├── health.py ├── train.py └── train_generate.py ├── infra ├── distribution_strategy │ ├── benchmark_mpi_collectives.py │ ├── benchmark_mpi_sendrecv.py │ └── test_fsdp.py ├── generate.py ├── get_results.py ├── health.py ├── openai_client.py ├── sanity.py ├── slurm.py ├── upload_dataset.py ├── vllm │ └── tokenformer │ │ └── test_tokenformer.py └── vllm_health.py ├── ml ├── rl │ ├── cs_semester.sqlite │ ├── mini-bird.json │ └── sql-reasoning.py ├── sql │ ├── data.json │ ├── train.py │ └── train_generate.py └── tokenformer │ ├── test_llama_tokenformer_model.py │ ├── test_tokenformer.py │ └── test_tokenformer_surgeon.py └── requirements-pytest.txt /.github/workflows/depot-amd.yml: -------------------------------------------------------------------------------- 1 | name: Build AMD image using depot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: depot-ubuntu-24.04-8 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Set up Depot CLI 15 | uses: depot/setup-action@v1 16 | 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: depot/build-push-action@v1 25 | env: 26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }} 27 | with: 28 | # if no depot.json file is at the root of your repo, you must specify the project id 29 | project: 39xfdrxfqt 30 | push: 
true 31 | tags: tensorwave/scalarlm-amd:latest 32 | build-args: | 33 | BASE_NAME=amd 34 | VLLM_TARGET_DEVICE=rocm 35 | PYTORCH_ROCM_ARCH=gfx90a;gfx942 36 | MAX_JOBS=8 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /.github/workflows/depot-cpu.yml: -------------------------------------------------------------------------------- 1 | name: Build CPU image using depot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: depot-ubuntu-22.04-8 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Set up Depot CLI 15 | uses: depot/setup-action@v1 16 | 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: depot/build-push-action@v1 25 | env: 26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }} 27 | with: 28 | # if no depot.json file is at the root of your repo, you must specify the project id 29 | project: 39xfdrxfqt 30 | push: true 31 | tags: tensorwave/scalarlm-cpu:latest 32 | build-args: | 33 | BASE_NAME=cpu 34 | VLLM_TARGET_DEVICE=cpu 35 | TORCH_CUDA_ARCH_LIST="" 36 | MAX_JOBS=8 37 | 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/depot-nvidia-8.0.yml: -------------------------------------------------------------------------------- 1 | name: Build NVIDIA CUDA 8.0 image using depot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: depot-ubuntu-22.04 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Set up Depot CLI 15 | uses: depot/setup-action@v1 16 | 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: depot/build-push-action@v1 25 | env: 26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }} 27 | with: 28 | # if no depot.json file is at the root of your repo, you must specify the project id 29 | project: 39xfdrxfqt 30 | push: true 31 | tags: gdiamos/scalarlm-nvidia-8.0:latest 32 | build-args: | 33 | BASE_NAME=nvidia 34 | VLLM_TARGET_DEVICE=cuda 35 | TORCH_CUDA_ARCH_LIST=8.0 36 | MAX_JOBS=2 37 | 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/depot-nvidia-8.6.yml: -------------------------------------------------------------------------------- 1 | name: Build NVIDIA CUDA 8.6 image using depot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: depot-ubuntu-22.04 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Set up Depot CLI 15 | uses: depot/setup-action@v1 16 | 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: depot/build-push-action@v1 25 | env: 26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }} 27 | with: 28 | # if no depot.json file is at the root of your repo, you must specify the project id 29 | project: 39xfdrxfqt 30 | push: true 31 | tags: tensorwave/scalarlm-nvidia-8.6:latest 32 | build-args: | 33 | BASE_NAME=nvidia 34 | VLLM_TARGET_DEVICE=cuda 35 | TORCH_CUDA_ARCH_LIST=8.6 36 | MAX_JOBS=2 37 | 38 | 39 | 
-------------------------------------------------------------------------------- /.github/workflows/depot-nvidia.yml: -------------------------------------------------------------------------------- 1 | name: Build NVIDIA image using depot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: depot-ubuntu-22.04-8 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Set up Depot CLI 15 | uses: depot/setup-action@v1 16 | 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v2 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_TOKEN }} 22 | 23 | - name: Build and push 24 | uses: depot/build-push-action@v1 25 | env: 26 | DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }} 27 | with: 28 | # if no depot.json file is at the root of your repo, you must specify the project id 29 | project: 39xfdrxfqt 30 | push: true 31 | tags: tensorwave/scalarlm-nvidia:latest 32 | build-args: | 33 | BASE_NAME=nvidia 34 | VLLM_TARGET_DEVICE=cuda 35 | TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0 36 | MAX_JOBS=8 37 | 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Build and run unit tests 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | docker-image: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repo 12 | uses: actions/checkout@v3 13 | 14 | - name: Run tests 15 | run: > 16 | ./cray test 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.swp 2 | **/*.so 3 | models/* 4 | infra/slurm_configs/slurm.conf 5 | scripts/cray 6 | 7 | *.DS_Store 8 | **/__pycache__/ 9 | .env 10 | .idea 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.10.0 4 | hooks: 5 | - id: black 6 | 7 | -------------------------------------------------------------------------------- /cmd/bashly.sh: -------------------------------------------------------------------------------- 1 | # e exit on first failure 2 | # x all executed commands are printed to the terminal 3 | # u unset variables are errors 4 | # a export all variables to the environment 5 | # E any trap on ERR is inherited by shell functions 6 | # -o pipefail | produces a failure code if any stage fails 7 | set -Eeuoxa pipefail 8 | 9 | # Get the directory of this script 10 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 11 | 12 | TTY=-t 13 | if test -t 0; then 14 | TTY=-it 15 | fi 16 | 17 | # Run the docker container 18 | docker run --rm $TTY --user $(id -u):$(id -g) \ 19 | --volume "$LOCAL_DIRECTORY:/app/cmd" \ 20 | --volume "$LOCAL_DIRECTORY/../scripts:/app/scripts" \ 21 | --volume "$LOCAL_DIRECTORY/bashly-settings.yml:/app/bashly-settings.yml" \ 22 | dannyben/bashly "$@" 23 | -------------------------------------------------------------------------------- /cmd/benchmark_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | target=${args[target]} 4 | visible_gpus=${args[visible-gpus]} 5 | 6 | ./cray build-image $target 7 | 8 | declare -a 
benchmark_command_parts 9 | benchmark_command_parts=( 10 | "CUDA_VISIBLE_DEVICES=${visible_gpus}" "python" "/app/cray/test/benchmark/main.py" 11 | ) 12 | 13 | benchmark_command="${benchmark_command_parts[*]}" 14 | 15 | echo $command 16 | 17 | # Get the directory of this script 18 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 19 | 20 | # Set cwd to the project root directory 21 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/.. 22 | 23 | declare -a docker_command_parts 24 | 25 | # Make sure the data directory exists 26 | mkdir -p $ROOT_DIRECTORY/data 27 | 28 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host" "-v" "$ROOT_DIRECTORY/data:/app/cray/data") 29 | 30 | declare -a gpu_options 31 | 32 | # Set the GPU options depending on the target 33 | if [ "$target" == "cpu" ]; then 34 | gpu_options+=() 35 | elif [ "$target" == "amd" ]; then 36 | gpu_options+=("--device" "/dev/kfd" "--device" "/dev/dri") 37 | else 38 | gpu_options+=("--gpus" "all") 39 | fi 40 | 41 | docker_command_parts+=("${gpu_options[@]}") 42 | docker_command_parts+=("cray:latest" "sh" "-c" "'$benchmark_command'") 43 | 44 | docker_command="${docker_command_parts[*]}" 45 | echo $docker_command 46 | eval $docker_command 47 | 48 | -------------------------------------------------------------------------------- /cmd/build_image_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | target=${args[target]} 4 | 5 | declare -a vllm_target_device 6 | declare -a docker_platform 7 | 8 | # If target is cpu, build the image with the cpu base image 9 | if [ "$target" == "cpu" ]; then 10 | vllm_target_device=("cpu") 11 | if [ "$(uname -m)" == "x86_64" ]; then 12 | docker_platform=("linux/amd64") 13 | else 14 | docker_platform=("linux/arm64/v8") 15 | fi 16 | elif [ "$target" == "amd" ]; then 17 | vllm_target_device=("rocm") 18 | docker_platform=("linux/amd64") 19 | else 20 | vllm_target_device=("cuda") 21 | docker_platform=("linux/amd64") 22 | fi 23 | 24 | docker_build_command="docker build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t cray:latest --shm-size=8g ." 25 | 26 | # Run docker build command 27 | echo $(green_bold Building image with command: ${docker_build_command}) 28 | eval $docker_build_command 29 | 30 | echo $(green_bold Successfully built image) 31 | -------------------------------------------------------------------------------- /cmd/depot_build_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | target=${args[target]} 4 | 5 | declare -a vllm_target_device 6 | declare -a docker_platform 7 | 8 | # If target is cpu, build the image with the cpu base image 9 | if [ "$target" == "cpu" ]; then 10 | vllm_target_device=("cpu") 11 | docker_platform=("linux/amd64") 12 | elif [ "$target" == "arm" ]; then 13 | vllm_target_device=("cpu") 14 | docker_platform=("linux/arm64/v8") 15 | elif [ "$target" == "amd" ]; then 16 | vllm_target_device=("rocm") 17 | docker_platform=("linux/amd64") 18 | else 19 | vllm_target_device=("cuda") 20 | docker_platform=("linux/amd64") 21 | fi 22 | 23 | docker_build_command="depot build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t gdiamos/cray-${target}:latest --push ." 
24 | 25 | # Run docker build command 26 | echo $(green_bold Building image with command: ${docker_build_command}) 27 | eval $docker_build_command 28 | 29 | echo $(green_bold Successfully built image) 30 | 31 | -------------------------------------------------------------------------------- /cmd/llm_logs_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | model=${args[model]} 4 | tail=${args[--tail]} 5 | lines=${args[--lines]} 6 | follow=${args[--follow]} 7 | 8 | if [ -z "$model" ]; then 9 | model="latest" 10 | fi 11 | 12 | ./cray build-image 13 | 14 | declare -a log_command_parts 15 | log_command_parts=( 16 | "python" "/app/cray/sdk/masint/cli/main.py" "logs" "--model" "$model" "--lines" "$lines" 17 | ) 18 | 19 | echo $tail 20 | 21 | # If tail exists, add it to the command 22 | if [ -n "$tail" ]; then 23 | log_command_parts+=("--tail") 24 | fi 25 | 26 | # If follow exists, add it to the command 27 | if [ -n "$follow" ]; then 28 | log_command_parts+=("--follow") 29 | fi 30 | 31 | log_command="${log_command_parts[*]}" 32 | 33 | echo $command 34 | 35 | declare -a docker_command_parts 36 | 37 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host") 38 | 39 | docker_command_parts+=("cray:latest" "sh" "-c" "'$log_command'") 40 | 41 | docker_command="${docker_command_parts[*]}" 42 | echo $docker_command 43 | eval $docker_command 44 | 45 | 46 | -------------------------------------------------------------------------------- /cmd/llm_ls_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | ./cray build-image 4 | 5 | declare -a ls_command_parts 6 | ls_command_parts=( 7 | "python" "/app/cray/sdk/masint/cli/main.py" "ls" 8 | ) 9 | 10 | ls_command="${ls_command_parts[*]}" 11 | 12 | echo $command 13 | 14 | # Get the directory of this script 15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 16 | 17 | # Set cwd to the project root directory 18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/.. 19 | 20 | declare -a docker_command_parts 21 | 22 | # Make sure the data directory exists 23 | mkdir -p $ROOT_DIRECTORY/data 24 | 25 | docker_command_parts=("docker" "run" "--rm" "--network" "host") 26 | 27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$ls_command'") 28 | 29 | docker_command="${docker_command_parts[*]}" 30 | echo $docker_command 31 | eval $docker_command 32 | 33 | -------------------------------------------------------------------------------- /cmd/llm_plot_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | model=${args[model]} 4 | 5 | if [ -z "$model" ]; then 6 | model="latest" 7 | fi 8 | 9 | ./cray build-image 10 | 11 | declare -a plot_command_parts 12 | plot_command_parts=( 13 | "python" "/app/cray/sdk/masint/cli/main.py" "plot" "--model" "$model" 14 | ) 15 | 16 | plot_command="${plot_command_parts[*]}" 17 | 18 | echo $command 19 | 20 | # Get the directory of this script 21 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 22 | 23 | # Set cwd to the project root directory 24 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/.. 
25 | 26 | declare -a docker_command_parts 27 | 28 | # Make sure the data directory exists 29 | mkdir -p $ROOT_DIRECTORY/data 30 | 31 | docker_command_parts=("docker" "run" "--rm" "-v" "$ROOT_DIRECTORY/data:/app/cray/data" "--network" "host") 32 | 33 | docker_command_parts+=("cray:latest" "sh" "-c" "'$plot_command'") 34 | 35 | docker_command="${docker_command_parts[*]}" 36 | echo $docker_command 37 | eval $docker_command 38 | 39 | -------------------------------------------------------------------------------- /cmd/llm_squeue_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | ./cray build-image 4 | 5 | declare -a squeue_command_parts 6 | squeue_command_parts=( 7 | "python" "/app/cray/sdk/masint/cli/main.py" "squeue" 8 | ) 9 | 10 | squeue_command="${squeue_command_parts[*]}" 11 | 12 | echo $command 13 | 14 | # Get the directory of this script 15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 16 | 17 | # Set cwd to the project root directory 18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/.. 19 | 20 | declare -a docker_command_parts 21 | 22 | # Make sure the data directory exists 23 | mkdir -p $ROOT_DIRECTORY/data 24 | 25 | docker_command_parts=("docker" "run" "--rm" "--network" "host") 26 | 27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$squeue_command'") 28 | 29 | docker_command="${docker_command_parts[*]}" 30 | echo $docker_command 31 | eval $docker_command 32 | 33 | 34 | -------------------------------------------------------------------------------- /cmd/pypi_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | # Get the directory of this script 4 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | # Set cwd to the project sdk directory 7 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/../sdk 8 | 9 | cd $ROOT_DIRECTORY 10 | 11 | # Build sdk wheel from sdk/pyproject.toml 12 | wheel_build_command="python -m build --sdist --wheel --outdir dist/ ." 
13 | 14 | # Run sdk wheel build 15 | echo $(green_bold Building wheel with command: ${wheel_build_command}) 16 | eval $wheel_build_command 17 | 18 | echo $(green_bold Successfully built wheel) 19 | 20 | # Upload wheel to pypi 21 | pypi_upload_command="twine upload dist/*" 22 | 23 | # Run pypi upload command 24 | echo $(green_bold Uploading wheel to pypi with command: ${pypi_upload_command}) 25 | eval $pypi_upload_command 26 | 27 | echo $(green_bold Successfully uploaded wheel to pypi) 28 | 29 | -------------------------------------------------------------------------------- /cmd/up_command.sh: -------------------------------------------------------------------------------- 1 | inspect_args 2 | 3 | target=${args[target]} 4 | 5 | declare -a vllm_target_device 6 | declare -a docker_compose_service 7 | 8 | if [ "$target" == "cpu" ]; then 9 | vllm_target_device=("cpu") 10 | docker_compose_service="cray" 11 | elif [ "$target" == "amd" ]; then 12 | vllm_target_device=("rocm") 13 | docker_compose_service="cray-amd" 14 | else 15 | vllm_target_device=("cuda") 16 | docker_compose_service="cray-nvidia" 17 | fi 18 | 19 | BASE_NAME=${target} VLLM_TARGET_DEVICE=${vllm_target_device} docker compose -f docker-compose.yaml up ${docker_compose_service} --build --force-recreate 20 | -------------------------------------------------------------------------------- /deployment/ansible/hosts: -------------------------------------------------------------------------------- 1 | ini 2 | [localhost] 3 | localhost ansible_connection=local 4 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the ScalarLM service 4 | name: scalarlm 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "scalarlm.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "scalarlm.vllmname" -}} 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{- define "scalarlm.labels" -}} 11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }} 12 | app.kubernetes.io/instance: {{ .Release.Name }} 13 | {{- end -}} 14 | 15 | {{- define "scalarlm.vllmlabels" -}} 16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . 
}} 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/templates/api_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/api_configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-api-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}" 12 | server_list: api 13 | max_train_time: {{ .Values.max_train_time }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/templates/api_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.api_port }} 9 | targetPort: 8000 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.labels" . | nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/templates/vllm_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-vllm-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}" 12 | server_list: vllm 13 | 14 | 15 | -------------------------------------------------------------------------------- /deployment/helm/amd_multi_node/scalarlm/templates/vllm_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.vllmname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.vllm_port }} 9 | targetPort: 8001 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.vllmlabels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the ScalarLM service 4 | name: scalarlm 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/local-hostpath-sc.yaml: -------------------------------------------------------------------------------- 1 | # local-hostpath-sc.yaml 2 | apiVersion: storage.k8s.io/v1 3 | kind: StorageClass 4 | metadata: 5 | name: local-hostpath 6 | provisioner: kubernetes.io/no-provisioner 7 | volumeBindingMode: Immediate 8 | 9 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/storageclass-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: storageclass-manager 5 | rules: 6 | - apiGroups: ["storage.k8s.io"] 7 | resources: ["storageclasses"] 8 | verbs: ["get", "list", "create", "delete", "patch", "update"] 9 | 10 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "scalarlm.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "scalarlm.vllmname" -}} 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{- define "scalarlm.labels" -}} 11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }} 12 | app.kubernetes.io/instance: {{ .Release.Name }} 13 | {{- end -}} 14 | 15 | {{- define "scalarlm.vllmlabels" -}} 16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }} 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/api_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/api_configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-api-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}" 12 | server_list: api 13 | max_train_time: {{ .Values.max_train_time }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/api_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.api_port }} 9 | targetPort: 8000 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.labels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/cache_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-cache 5 | annotations: 6 | helm.sh/resource-policy: keep 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.cache_pvc.size }} 13 | storageClassName: {{ .Values.cache_pvc.storageClass }} 14 | wait_until_bound: false 15 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/jobs_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-jobs 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: {{ .Values.jobs_pvc.size }} 11 | storageClassName: {{ .Values.jobs_pvc.storageClass }} 12 | wait_until_bound: false 13 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/vllm_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-vllm-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}" 12 | server_list: vllm 13 | 14 | 15 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/templates/vllm_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.vllmname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.vllm_port }} 9 | targetPort: 8001 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.vllmlabels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_node/scalarlm/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: sudnya/scalarlm-rocm 3 | tag: v0.7 4 | pullPolicy: Always 5 | 6 | env: 7 | - name: HIP_VISIBLE_DEVICES 8 | value: "0" 9 | - name: ROCR_VISIBLE_DEVICES 10 | value: "0" 11 | 12 | service: 13 | type: ClusterIP 14 | api_port: 8000 15 | vllm_port: 8001 16 | externalIP: 10.1.81.248 17 | 18 | jobs_pvc: 19 | storageClass: openebs-hostpath 20 | size: 100Gi 21 | 22 | cache_pvc: 23 | storageClass: openebs-hostpath 24 | size: 32Gi 25 | 26 | model: meta-llama/Llama-3.1-8B-Instruct 27 | max_model_length: 4096 28 | gpu_memory_utilization: 0.95 29 | 30 | training_gpus: 2 31 | inference_gpus: 1 32 | 33 | max_train_time: 86400 34 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_pod/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the Cray service 4 | name: cray 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_pod/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "cray.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "cray.labels" -}} 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }} 8 | app.kubernetes.io/instance: {{ .Release.Name }} 9 | {{- end -}} 10 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_pod/scalarlm/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | 12 | 13 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_pod/scalarlm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "cray.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.port }} 9 | targetPort: {{ .Values.service.targetPort }} 10 | protocol: TCP 11 | name: http 12 | - port: 8001 13 | targetPort: 8001 14 | protocol: TCP 15 | name: http2 16 | externalIPs: 17 | - {{ .Values.service.externalIP }} 18 | selector: 19 | {{- include "cray.labels" . | nindent 4 }} 20 | -------------------------------------------------------------------------------- /deployment/helm/amd_single_pod/scalarlm/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for cray-chart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: sudnya/scalarlm-rocm 9 | tag: latest 10 | pullPolicy: Always 11 | 12 | env: 13 | - name: HIP_VISIBLE_DEVICES 14 | value: "0" 15 | - name: ROCR_VISIBLE_DEVICES 16 | value: "0" 17 | service: 18 | type: ClusterIP 19 | port: 8000 20 | targetPort: 8000 21 | externalIP: 10.1.81.248 22 | 23 | model: meta-llama/Llama-3.1-8B-Instruct 24 | max_model_length: 4096 25 | gpu_memory_utilization: 0.33 26 | -------------------------------------------------------------------------------- /deployment/helm/cray/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /deployment/helm/cray/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: cray 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | #appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /deployment/helm/cray/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | apiVersion: autoscaling/v2 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "cray.fullname" . }} 6 | labels: 7 | {{- include "cray.labels" . | nindent 4 }} 8 | spec: 9 | scaleTargetRef: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | name: {{ include "cray.fullname" . 
}} 13 | minReplicas: {{ .Values.autoscaling.minReplicas }} 14 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 15 | metrics: 16 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} 17 | - type: Resource 18 | resource: 19 | name: cpu 20 | target: 21 | type: Utilization 22 | averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 23 | {{- end }} 24 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} 25 | - type: Resource 26 | resource: 27 | name: memory 28 | target: 29 | type: Utilization 30 | averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} 31 | {{- end }} 32 | {{- end }} 33 | -------------------------------------------------------------------------------- /deployment/helm/cray/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | apiVersion: networking.k8s.io/v1 3 | kind: Ingress 4 | metadata: 5 | name: {{ include "cray.fullname" . }} 6 | labels: 7 | {{- include "cray.labels" . | nindent 4 }} 8 | {{- with .Values.ingress.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | {{- with .Values.ingress.className }} 14 | ingressClassName: {{ . }} 15 | {{- end }} 16 | {{- if .Values.ingress.tls }} 17 | tls: 18 | {{- range .Values.ingress.tls }} 19 | - hosts: 20 | {{- range .hosts }} 21 | - {{ . | quote }} 22 | {{- end }} 23 | secretName: {{ .secretName }} 24 | {{- end }} 25 | {{- end }} 26 | rules: 27 | {{- range .Values.ingress.hosts }} 28 | - host: {{ .host | quote }} 29 | http: 30 | paths: 31 | {{- range .paths }} 32 | - path: {{ .path }} 33 | {{- with .pathType }} 34 | pathType: {{ . }} 35 | {{- end }} 36 | backend: 37 | service: 38 | name: {{ include "cray.fullname" $ }} 39 | port: 40 | number: {{ $.Values.service.port }} 41 | {{- end }} 42 | {{- end }} 43 | {{- end }} 44 | -------------------------------------------------------------------------------- /deployment/helm/cray/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "cray.fullname" . }} 5 | labels: 6 | {{- include "cray.labels" . | nindent 4 }} 7 | spec: 8 | type: {{ .Values.service.type }} 9 | ports: 10 | - port: {{ .Values.service.port }} 11 | targetPort: http 12 | protocol: TCP 13 | name: http 14 | selector: 15 | {{- include "cray.selectorLabels" . | nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/cray/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "cray.serviceAccountName" . }} 6 | labels: 7 | {{- include "cray.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /deployment/helm/cray/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "cray.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "cray.labels" . 
| nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "cray.fullname" . }}:{{ .Values.service.port }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /deployment/helm/lambda/cray/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the Cray service 4 | name: cray 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/lambda/cray/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "cray.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "cray.labels" -}} 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }} 8 | app.kubernetes.io/instance: {{ .Release.Name }} 9 | {{- end -}} 10 | -------------------------------------------------------------------------------- /deployment/helm/lambda/cray/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | 12 | 13 | -------------------------------------------------------------------------------- /deployment/helm/lambda/cray/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "cray.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.port }} 9 | targetPort: {{ .Values.service.targetPort }} 10 | protocol: TCP 11 | name: http 12 | - port: 8001 13 | targetPort: 8001 14 | protocol: TCP 15 | name: http2 16 | externalIPs: 17 | - {{ .Values.service.externalIP }} 18 | selector: 19 | {{- include "cray.labels" . | nindent 4 }} 20 | -------------------------------------------------------------------------------- /deployment/helm/lambda/cray/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for cray-chart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: gdiamos/cray-nvidia 9 | tag: latest 10 | pullPolicy: IfNotPresent 11 | 12 | service: 13 | type: ClusterIP 14 | port: 8000 15 | targetPort: 8000 16 | externalIP: 104.171.203.79 17 | 18 | model: meta-llama/Llama-3.2-3B-Instruct 19 | max_model_length: 4096 20 | gpu_memory_utilization: 0.33 21 | 22 | -------------------------------------------------------------------------------- /deployment/helm/minikube/cray/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the Cray service 4 | name: cray 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/minikube/cray/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "cray.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "cray.labels" -}} 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }} 8 | app.kubernetes.io/instance: {{ .Release.Name }} 9 | {{- end -}} 10 | -------------------------------------------------------------------------------- /deployment/helm/minikube/cray/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "cray.fullname" . }} 5 | spec: 6 | replicas: {{ .Values.replicaCount }} 7 | selector: 8 | matchLabels: 9 | {{- include "cray.labels" . | nindent 6 }} 10 | template: 11 | metadata: 12 | labels: 13 | {{- include "cray.labels" . | nindent 8 }} 14 | spec: 15 | {{- with .Values.imagePullSecrets }} 16 | imagePullSecrets: 17 | {{- toYaml . | nindent 8 }} 18 | {{- end }} 19 | containers: 20 | - name: {{ .Chart.Name }} 21 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 22 | imagePullPolicy: {{ .Values.image.pullPolicy }} 23 | command: ["/app/cray/scripts/start_one_server.sh"] 24 | ports: 25 | - name: http 26 | containerPort: 8000 27 | hostPort: 8000 28 | protocol: TCP 29 | - name: http2 30 | containerPort: 8001 31 | hostPort: 8001 32 | protocol: TCP 33 | volumeMounts: 34 | {{- range .Values.volumes }} 35 | - name: {{ .name }} 36 | mountPath: {{ .path }} 37 | {{- end }} 38 | volumes: 39 | {{- range .Values.volumes }} 40 | - name: {{ .name }} 41 | hostPath: 42 | path: {{ .hostPath }} 43 | {{- end }} 44 | -------------------------------------------------------------------------------- /deployment/helm/minikube/cray/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "cray.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.port }} 9 | targetPort: {{ .Values.service.targetPort }} 10 | protocol: TCP 11 | name: http 12 | - port: 8001 13 | targetPort: 8001 14 | protocol: TCP 15 | name: http2 16 | selector: 17 | {{- include "cray.labels" . | nindent 4 }} 18 | -------------------------------------------------------------------------------- /deployment/helm/minikube/cray/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for cray-chart. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: gdiamos/masint-arm 9 | tag: latest 10 | pullPolicy: IfNotPresent 11 | 12 | service: 13 | type: ClusterIP 14 | port: 8000 15 | targetPort: 8000 16 | 17 | volumes: 18 | - name: ml 19 | path: /app/cray/ml 20 | hostPath: /Users/gregorydiamos/checkout/cray/ml 21 | 22 | network: 23 | name: cray-network 24 | 25 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the ScalarLM service 4 | name: scalarlm 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "scalarlm.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "scalarlm.vllmname" -}} 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{- define "scalarlm.labels" -}} 11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }} 12 | app.kubernetes.io/instance: {{ .Release.Name }} 13 | {{- end -}} 14 | 15 | {{- define "scalarlm.vllmlabels" -}} 16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }} 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/api_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/api_configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-api-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}" 12 | server_list: api 13 | max_train_time: {{ .Values.max_train_time }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/api_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.api_port }} 9 | targetPort: 8000 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.labels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/cache_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-3b-cache 5 | annotations: 6 | helm.sh/resource-policy: keep 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.cache_pvc.size }} 13 | storageClassName: {{ .Values.cache_pvc.storageClass }} 14 | wait_until_bound: false 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/jobs_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-3b-jobs 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: {{ .Values.jobs_pvc.size }} 11 | storageClassName: {{ .Values.jobs_pvc.storageClass }} 12 | wait_until_bound: false 13 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/vllm_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-vllm-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}" 12 | server_list: vllm 13 | 14 | 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/templates/vllm_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.vllmname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.vllm_port }} 9 | targetPort: 8001 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.vllmlabels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave3b/scalarlm/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: tensorwave/scalarlm-amd 3 | tag: latest 4 | pullPolicy: Always 5 | 6 | service: 7 | type: ClusterIP 8 | api_port: 8100 9 | vllm_port: 8101 10 | externalIP: 64.139.222.101 11 | 12 | jobs_pvc: 13 | storageClass: local-path 14 | size: 100Gi 15 | 16 | cache_pvc: 17 | storageClass: local-path 18 | size: 16Gi 19 | 20 | model: meta-llama/Llama-3.2-3B-Instruct 21 | max_model_length: 32768 22 | gpu_memory_utilization: 0.95 23 | 24 | training_gpus: 1 25 | inference_gpus: 1 26 | 27 | max_train_time: 14400 28 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the ScalarLM service 4 | name: scalarlm 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "scalarlm.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "scalarlm.vllmname" -}} 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{- define "scalarlm.labels" -}} 11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }} 12 | app.kubernetes.io/instance: {{ .Release.Name }} 13 | {{- end -}} 14 | 15 | {{- define "scalarlm.vllmlabels" -}} 16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }} 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/api_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/api_configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-api-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}" 12 | server_list: api 13 | max_train_time: {{ .Values.max_train_time }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/api_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.api_port }} 9 | targetPort: 8000 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.labels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/cache_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-70b-cache 5 | annotations: 6 | helm.sh/resource-policy: keep 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.cache_pvc.size }} 13 | storageClassName: {{ .Values.cache_pvc.storageClass }} 14 | wait_until_bound: false 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/jobs_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-70b-jobs 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: {{ .Values.jobs_pvc.size }} 11 | storageClassName: {{ .Values.jobs_pvc.storageClass }} 12 | wait_until_bound: false 13 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/vllm_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-vllm-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}" 12 | server_list: vllm 13 | 14 | 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/templates/vllm_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.vllmname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.vllm_port }} 9 | targetPort: 8001 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.vllmlabels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave70b/scalarlm/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: sudnya/scalarlm-rocm 3 | tag: v0.8 4 | pullPolicy: Always 5 | 6 | service: 7 | type: ClusterIP 8 | api_port: 8200 9 | vllm_port: 8201 10 | externalIP: 64.139.222.101 11 | 12 | jobs_pvc: 13 | storageClass: local-path 14 | size: 100Gi 15 | 16 | cache_pvc: 17 | storageClass: local-path 18 | size: 200Gi 19 | 20 | model: meta-llama/Llama-3.3-70B-Instruct 21 | max_model_length: 4096 22 | gpu_memory_utilization: 0.95 23 | 24 | training_gpus: 2 25 | inference_gpus: 1 26 | 27 | max_train_time: 86400 28 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: 1.0.0 3 | description: A Helm chart for the ScalarLM service 4 | name: scalarlm 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | gotemplate 2 | {{- define "scalarlm.fullname" -}} 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}} 4 | {{- end -}} 5 | 6 | {{- define "scalarlm.vllmname" -}} 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{- define "scalarlm.labels" -}} 11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }} 12 | app.kubernetes.io/instance: {{ .Release.Name }} 13 | {{- end -}} 14 | 15 | {{- define "scalarlm.vllmlabels" -}} 16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }} 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/api_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/api_configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-api-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}" 12 | server_list: api 13 | max_train_time: {{ .Values.max_train_time }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/api_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.fullname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.api_port }} 9 | targetPort: 8000 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.labels" . 
| nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/cache_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-8b-cache 5 | annotations: 6 | helm.sh/resource-policy: keep 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.cache_pvc.size }} 13 | storageClassName: {{ .Values.cache_pvc.storageClass }} 14 | wait_until_bound: false 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/jobs_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: scalarlm-8b-jobs 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | resources: 9 | requests: 10 | storage: {{ .Values.jobs_pvc.size }} 11 | storageClassName: {{ .Values.jobs_pvc.storageClass }} 12 | wait_until_bound: false 13 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/vllm_configmap.yaml: -------------------------------------------------------------------------------- 1 | # templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ .Release.Name }}-vllm-configmap 6 | data: 7 | cray-config.yaml: | 8 | model: {{ .Values.model }} 9 | max_model_length: {{ .Values.max_model_length }} 10 | gpu_memory_utilization: {{ .Values.gpu_memory_utilization }} 11 | api_url: "http://scalarlm:{{ .Values.service.api_port }}" 12 | server_list: vllm 13 | 14 | 15 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/templates/vllm_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "scalarlm.vllmname" . }} 5 | spec: 6 | type: {{ .Values.service.type }} 7 | ports: 8 | - port: {{ .Values.service.vllm_port }} 9 | targetPort: 8001 10 | protocol: TCP 11 | name: http 12 | externalIPs: 13 | - {{ .Values.service.externalIP }} 14 | selector: 15 | {{- include "scalarlm.vllmlabels" . | nindent 4 }} 16 | -------------------------------------------------------------------------------- /deployment/helm/tensorwave8b/scalarlm/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: gdiamos/scalarlm-amd #tensorwave/scalarlm-amd 3 | tag: fsdp 4 | pullPolicy: Always 5 | 6 | service: 7 | type: ClusterIP 8 | api_port: 9000 9 | vllm_port: 9001 10 | externalIP: 64.139.222.101 11 | 12 | jobs_pvc: 13 | storageClass: local-path 14 | size: 100Gi 15 | 16 | cache_pvc: 17 | storageClass: local-path 18 | size: 32Gi 19 | 20 | model: meta-llama/Llama-3.1-8B-Instruct 21 | max_model_length: 4096 22 | gpu_memory_utilization: 0.95 23 | 24 | training_gpus: 2 25 | inference_gpus: 1 26 | 27 | max_train_time: 86400 28 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | 3 | cray: &cray 4 | command: /app/cray/scripts/start_one_server.sh 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | args: 9 | - BASE_NAME=${BASE_NAME} 10 | - VLLM_TARGET_DEVICE=${VLLM_TARGET_DEVICE} 11 | ports: 12 | - "8000:8000" 13 | - "8001:8001" 14 | volumes: 15 | - type: bind 16 | source: ./models 17 | target: /root/.cache/huggingface 18 | - type: bind 19 | source: ./infra/cray_infra 20 | target: /app/cray/infra/cray_infra 21 | - type: bind 22 | source: ./scripts 23 | target: /app/cray/scripts 24 | - type: bind 25 | source: ./ml 26 | target: /app/cray/ml 27 | - type: bind 28 | source: ./test 29 | target: /app/cray/test 30 | networks: 31 | - cray-network 32 | 33 | 34 | cray-nvidia: 35 | <<: *cray 36 | deploy: 37 | resources: 38 | reservations: 39 | devices: 40 | - driver: nvidia 41 | capabilities: [gpu] 42 | 43 | cray-amd: 44 | <<: *cray 45 | devices: 46 | - /dev/kfd 47 | - /dev/dri 48 | security_opt: 49 | - seccomp:unconfined 50 | 51 | 52 | networks: 53 | cray-network: 54 | name: cray_network 55 | 56 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/arch.md: -------------------------------------------------------------------------------- 1 | # ScalarLM 2 | 3 | ScalarLM has three high level APIs: 4 | 5 | * **completions** provides OpenAI client compatibility 6 | * **generate** provides a simple interface for generating text 7 | * **train** provides a simple interface for submitting training jobs 8 | 9 | ![ScalarLM overview](assets/cray-arch.png) 10 | 11 | 12 | Inference is performed by vLLM workers that are orchestrated by pulling requests from a queue. 13 | 14 | Training is performed by Megatron-LM workers that are orchestrated by SLURM. 15 | 16 | Trained models are automatically registered with the inference workers. 17 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/assets/cray-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray-arch.png -------------------------------------------------------------------------------- /docs/cray-docs/docs/assets/cray.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray.jpeg -------------------------------------------------------------------------------- /docs/cray-docs/docs/assets/loss_plot_044db4ac60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/loss_plot_044db4ac60.png -------------------------------------------------------------------------------- /docs/cray-docs/docs/cli/list-models.md: -------------------------------------------------------------------------------- 1 | # List Models 2 | 3 | ```console 4 | ./cray llm ls 5 | ``` 6 | 7 | This command lists all of the models that have been trained on the ScalarLM server. 8 | 9 | ```console 10 | 69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e 11 | ``` 12 | 13 | ScalarLM names models with a unique identifier based on the input data and training parameters. 
14 | 15 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/cli/plot.md: -------------------------------------------------------------------------------- 1 | # Plot 2 | 3 | ```console 4 | ./cray llm plot 5 | ``` 6 | 7 | This command plots the training loss of a specified model. 8 | 9 | If no model is specified, the command will plot the training loss of the most recently trained model. 10 | 11 | ![Plot](../assets/loss_plot_044db4ac60.png) 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/cli/squeue.md: -------------------------------------------------------------------------------- 1 | # squeue 2 | 3 | ```console 4 | ./cray llm squeue 5 | ``` 6 | 7 | This command is a wrapper around the `squeue` command. It is used to display the status of jobs in the training queue. The output is similar to the `squeue` command, but with some additional formatting. 8 | 9 | ```console 10 | JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON) 11 | 8 short 00f186ab039b root PENDING 0:00 20:00 1 (Priority) 12 | 7 short f1ba9c0eb11b root PENDING 0:00 20:00 1 (Priority) 13 | 6 short 0746261fd1db root PENDING 0:00 20:00 1 (Priority) 14 | 5 short ae55dedbb496 root PENDING 0:00 20:00 1 (Priority) 15 | 4 short d2bc30a36081 root PENDING 0:00 20:00 1 (Priority) 16 | 3 short bce8e63a7bef root PENDING 0:00 20:00 1 (Resources) 17 | 2 short c42b59ab0fb1 root RUNNING 0:34 20:00 1 df294b9206ff 18 | ``` 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/contact.md: -------------------------------------------------------------------------------- 1 | # Contact Us 2 | 3 | Project ScalarLM is developed by an Artificial Intelligence engineering consortium, built on a philosophy of open collaboration to improve AI systems. Through our collective engineering efforts with industry and academia we continually integrate and improve the accuracy, safety, speed, and efficiency of AI technologies–helping companies and universities around the world build better AI systems that will benefit society. 4 | 5 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69) 6 | 7 | 8 | * Greg Diamos 9 | * Naila Farooqui 10 | * Sudnya Diamos 11 | * Suhabe Bugrara 12 | 13 | 14 | We accept community contributions and are always looking for new collaborators. If you are interested in contributing to Project ScalarLM, please reach out to us at [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69). 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/deployment/docker.md: -------------------------------------------------------------------------------- 1 | # Docker builds 2 | 3 | Check out prebuilt docker containers for different targets: 4 | 5 | | Target | Container | Latest Release v0.5 | 6 | -------- | --------------------------- | ------------------------ | 7 | | NVIDIA | gdiamos/cray-nvidia:latest | gdiamos/cray-nvidia:v0.5 | 8 | | ARM | gdiamos/cray-arm:latest | gdiamos/cray-arm:v0.5 | 9 | | AMD | gdiamos/cray-amd:latest | gdiamos/cray-amd:v0.5 | 10 | | x86 | gdiamos/cray-cpu:latest | gdiamos/cray-cpu:v0.5 | 11 | 12 | For example, to launch a development server on a modern macbook, e.g. 
m2 13 | 14 | ```bash 15 | docker run -it -p 8000:8000 --entrypoint /app/cray/scripts/start_one_server.sh gdiamos/cray-arm:v0.5 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/deployment/modal.md: -------------------------------------------------------------------------------- 1 | # Modal 2 | 3 | ScalarLM can be deployed on Modal for easy access to GPUs. 4 | 5 | Clone the [ScalarLM repository](https://github.com/tensorwavecloud/scalarlm) and start the server. 6 | 7 | ```console 8 | git clone git@github.com:tensorwavecloud/scalarlm.git 9 | cd cray 10 | ./cray deploy 11 | ``` 12 | 13 | Modal should give you an endpoint you can start using. 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to ScalarLM 2 | 3 | ScalarLM is a fully open source, CC-0 Licensed, integrated LLM inference and training platform. 4 | 5 | ScalarLM builds on top of the vLLM inference engine, the Megatron-LM training framework, and the HuggingFace model hub. It unifies the capabilities of these tools into a single platform, enabling users to easily perform LLM inference and training, and build higher lever applications such as Agents with a twist - they can teach themselves new abilities via back propagation. 6 | 7 | ScalarLM is designed for high peformance. It inherits the distributed training capabilities of Megatron-LM and the optimized inference engine of vLLM. Cray is also designed to be easy to use. It provides an OpenAI compatible server and a simple command line interface for users to interact with the platform. 8 | 9 | ScalarLM is inspired by the work of Seymour Roger Cray (September 28, 1925 – October 5, 1996), an American electrical engineer and supercomputer architect who designed a series of computers that were the fastest in the world for decades, and founded Cray Research, which built many of these machines. Called "the father of supercomputing", Cray has been credited with creating the supercomputer industry. 10 | 11 | Learn more about ScalarLM at our [Blog](https://blog.scalarlm.com) and [GitHub](https://github.com/scalarlm/scalarlm). 12 | 13 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69) 14 | 15 | ![ScalarLM](assets/cray.jpeg) 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/inference.md: -------------------------------------------------------------------------------- 1 | # Inference 2 | 3 | 4 | ## OpenAI Compatible Server 5 | 6 | ```console 7 | curl https://meta-llama--llama-3-2-3b-instruct.cray-lm.com/v1/openai/chat/completions \ 8 | -H "Content-Type: application/json" \ 9 | -d '{ 10 | "model": "meta-llama/Llama-3.2-3B-Instruct", 11 | "messages": [ 12 | {"role": "system", "content": "You are a helpful assistant."}, 13 | {"role": "user", "content": "Who won the world series in 2020?"} 14 | ] 15 | }' 16 | ``` 17 | 18 | ## Using the Python client 19 | 20 | You can also use the Python client to interact with the ScalarLM server. 
21 | 22 | ```python 23 | 24 | import masint 25 | 26 | masint.api_url = "https://meta-llama--llama-3-2-3b-instruct.cray-lm.com" 27 | 28 | def get_dataset(): 29 | dataset = [] 30 | 31 | count = 4 32 | 33 | for i in range(count): 34 | dataset.append(f"What is {i} + {i}?") 35 | 36 | return dataset 37 | 38 | 39 | llm = masint.SupermassiveIntelligence() 40 | 41 | dataset = get_dataset() 42 | 43 | results = llm.generate(prompts=dataset) 44 | 45 | print(results) 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /docs/cray-docs/docs/training.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | ## Training jobs 4 | 5 | You can also use the Python client to submit training jobs to the ScalarLM server. 6 | 7 | ```python 8 | 9 | import masint 10 | 11 | def get_dataset(): 12 | dataset = [] 13 | 14 | count = 5 15 | 16 | for i in range(count): 17 | dataset.append( 18 | {"input": f"What is {i} + {i}?", "output": str(i + i)} 19 | ) 20 | 21 | return dataset 22 | 23 | 24 | llm = masint.SupermassiveIntelligence() 25 | 26 | dataset = get_dataset() 27 | 28 | status = llm.train(dataset, train_args={"max_steps": 200, "learning_rate": 3e-3}) 29 | 30 | print(status) 31 | ``` 32 | 33 | You get a command line output like this: 34 | 35 | ```console 36 | (environment) gregorydiamos@Air-Gregory cray % python test/deployment/train.py 37 | {'job_id': '1', 'status': 'QUEUED', 'message': 'Training job launched', 'dataset_id': 'dataset', 'job_directory': '/app/cray/jobs/69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e', 'model_name': '69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e'} 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /docs/cray-docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ScalarLM Docs 2 | nav: 3 | - Getting Started: 4 | - Introduction: index.md 5 | - Quick Start: quickstart.md 6 | - Architecture: arch.md 7 | - Contact: contact.md 8 | - Examples: 9 | - Inference: inference.md 10 | - Training: training.md 11 | - Command Line: 12 | - CLI: cli/cli.md 13 | - List Models: cli/list-models.md 14 | - Training Logs: cli/training-logs.md 15 | - Squeue: cli/squeue.md 16 | - Plot: cli/plot.md 17 | - Deployment: 18 | - Laptop: deployment/laptop.md 19 | - Kubernetes: deployment/kubernetes.md 20 | - Modal: deployment/modal.md 21 | - Modal Details: deployment/modal-details.md 22 | - Docker: deployment/docker.md 23 | -------------------------------------------------------------------------------- /docs/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | # Get the directory of this script 13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 14 | 15 | # Clean up the old deployment directory 16 | rm -rf $LOCAL_DIRECTORY/gh-pages-deployment 17 | 18 | # Clone the repository 19 | git clone git@github.com:tensorwavecloud/scalarlm $LOCAL_DIRECTORY/gh-pages-deployment 20 | 21 | # Change to the deployment directory 22 | cd 
$LOCAL_DIRECTORY/gh-pages-deployment 23 | 24 | # Change to the git branch 25 | git checkout gh-pages 26 | 27 | # Copy the local files from cray-docs to the deployment directory 28 | cp $LOCAL_DIRECTORY/cray-docs/mkdocs.yml $LOCAL_DIRECTORY/gh-pages-deployment 29 | cp -r $LOCAL_DIRECTORY/cray-docs/docs $LOCAL_DIRECTORY/gh-pages-deployment/docs 30 | 31 | # Add all the files to the git repository 32 | #git add . 33 | 34 | # Commit the changes 35 | #git commit -m "Deploying the latest documentation" 36 | 37 | # Run mkdocs gh-deploy 38 | mkdocs gh-deploy 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /frontend/assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 | 32 | 33 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/aiohttp/get_global_session.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | session = None 4 | 5 | 6 | def get_global_session(): 7 | global session 8 | if session is None: 9 | session = aiohttp.ClientSession() 10 | return session 11 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/generate/finish_work.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue 2 | 3 | from cray_infra.api.fastapi.routers.request_types.finish_work_request import FinishWorkRequests 4 | 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | async def finish_work(requests : FinishWorkRequests): 10 | inference_work_queue = get_inference_work_queue() 11 | 12 | for request in requests.requests: 13 | logger.debug(f"Finishing work for request {request.request_id}") 14 | 15 | result = inference_work_queue.get_id(id=request.request_id) 16 | 17 | if request.response is not None: 18 | result["response"] = request.response 19 | 20 | if request.error is not None: 21 | result["error"] = request.error 22 | 23 | inference_work_queue.update(id=request.request_id, item=result) 24 | 25 | inference_work_queue.ack(id=request.request_id) 26 | 27 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/generate/get_results.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.routers.request_types.get_results_request import GetResultsRequest 2 | 3 | from cray_infra.api.fastapi.generate.poll_for_responses import poll_for_responses 4 | 5 | 6 | async def get_results(request: GetResultsRequest): 7 | return await poll_for_responses(request.request_ids) 8 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/health/check_health.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.aiohttp.get_global_session import get_global_session 2 | from cray_infra.util.get_config import get_config 3 | 4 | 5 | async def check_health(): 6 | vllm_health = await get_vllm_health() 7 | api_health = "up" 8 | all_health = get_all_health([vllm_health, api_health]) 9 | return {"api": "up", "vllm": vllm_health, "all": all_health} 10 | 11 | 12 | def get_all_health(healths): 13 | if all(health == "up" for 
health in healths): 14 | return "up" 15 | 16 | if all(health == "down" for health in healths): 17 | return "down" 18 | 19 | return "mixed" 20 | 21 | 22 | async def get_vllm_health(): 23 | try: 24 | session = get_global_session() 25 | config = get_config() 26 | async with session.get(config["vllm_api_url"] + "/health") as resp: 27 | assert resp.status == 200 28 | return "up" 29 | except Exception as e: 30 | return {"status": "down", "reason": str(e)} 31 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/main.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.routers.openai_router import ( 2 | openai_router, 3 | ) 4 | from cray_infra.api.fastapi.routers.megatron_router import ( 5 | megatron_router, 6 | ) 7 | from cray_infra.api.fastapi.routers.health_router import ( 8 | health_router, 9 | ) 10 | from cray_infra.api.fastapi.routers.generate_router import ( 11 | generate_router, 12 | ) 13 | 14 | from cray_infra.api.fastapi.tasks.add_megatron_tasks import ( 15 | add_megatron_tasks, 16 | ) 17 | 18 | from fastapi import FastAPI, Request 19 | from fastapi.middleware.cors import CORSMiddleware 20 | 21 | import logging 22 | import os 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | app = FastAPI(lifespan=add_megatron_tasks) 29 | 30 | app.include_router(openai_router, prefix="/v1") 31 | app.include_router(megatron_router, prefix="/v1") 32 | app.include_router(health_router, prefix="/v1") 33 | app.include_router(generate_router, prefix="/v1") 34 | 35 | 36 | origins = [ 37 | "http://localhost:3000", 38 | ] 39 | 40 | app.add_middleware( 41 | CORSMiddleware, 42 | allow_origins=origins, 43 | allow_credentials=True, 44 | allow_methods=["*"], 45 | allow_headers=["*"], 46 | ) 47 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/health_router.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.health.check_health import check_health 2 | 3 | from fastapi import APIRouter 4 | 5 | from fastapi.responses import JSONResponse 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | health_router = APIRouter(prefix="/health") 11 | 12 | 13 | @health_router.get("") 14 | async def health(): 15 | return await check_health() 16 | 17 | 18 | @health_router.get("/keepalive") 19 | async def health(): 20 | return {"status": "ok"} 21 | 22 | 23 | @health_router.get("/endpoints") 24 | async def list_routes(): 25 | routes = [ 26 | f"Path: {route.path}, Methods: {', '.join(route.methods)}" 27 | for route in health_router.routes 28 | ] 29 | return JSONResponse(content={"endpoints": routes}, media_type="application/json") 30 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/embed_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | 6 | class EmbedRequest(BaseModel): 7 | model: Optional[str] = None 8 | prompts: list[str] 9 | 10 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/finish_work_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional, Union 4 
| 5 | 6 | class FinishWorkRequest(BaseModel): 7 | request_id: int 8 | response: Optional[Union[str, list[float]]] = None 9 | error: Optional[str] = None 10 | 11 | 12 | class FinishWorkRequests(BaseModel): 13 | requests: list[FinishWorkRequest] 14 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/generate_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | 6 | class GenerateRequest(BaseModel): 7 | model: Optional[str] = None 8 | prompts: list[str] 9 | max_tokens: Optional[int] = 16 10 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/generate_response.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional, Union 4 | 5 | class Result(BaseModel): 6 | request_id: int 7 | response: Optional[Union[str, list[float]]] = None 8 | error: Optional[str] = None 9 | 10 | class GenerateResponse(BaseModel): 11 | results: list[Result] 12 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/get_results_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | class GetResultsRequest(BaseModel): 6 | request_ids: list[int] 7 | 8 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/get_results_response.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.routers.request_types.generate_response import GenerateResponse as GetResultsResponse 2 | 3 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/get_work_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | class GetWorkRequest(BaseModel): 6 | batch_size: int 7 | 8 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/get_work_response.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | 6 | class GetWorkResponse(BaseModel): 7 | prompt: str 8 | request_id: int 9 | request_type: str 10 | model: Optional[str] = None 11 | max_tokens: Optional[int] = None 12 | 13 | 14 | class GetWorkResponses(BaseModel): 15 | requests: list[GetWorkResponse] 16 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/list_models_response.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ListModelsResponse(BaseModel): 5 | models: list[dict] 6 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/squeue_response.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 
from typing import Optional 4 | 5 | 6 | class SqueueResponse(BaseModel): 7 | squeue_output : Optional[str] = None 8 | error_message : Optional[str] = None 9 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/routers/request_types/train_request.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | 6 | class TrainResponse(BaseModel): 7 | job_status: dict 8 | job_config: dict 9 | deployed: Optional[bool] = False 10 | -------------------------------------------------------------------------------- /infra/cray_infra/api/fastapi/tasks/add_megatron_tasks.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.get_config import get_config 2 | 3 | from cray_infra.training.restart_megatron_jobs import restart_megatron_jobs 4 | from cray_infra.training.register_megatron_models import register_megatron_models 5 | from cray_infra.generate.clear_acked_requests_from_queue import clear_acked_requests_from_queue 6 | 7 | from fastapi_utils.tasks import repeat_every 8 | 9 | from contextlib import asynccontextmanager 10 | 11 | import traceback 12 | import sys 13 | import logging 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @asynccontextmanager 19 | async def add_megatron_tasks(app): 20 | config = get_config() 21 | 22 | megatron_refresh_period = config["megatron_refresh_period"] 23 | 24 | @repeat_every(seconds=megatron_refresh_period) 25 | async def run_megatron_tasks(): 26 | try: 27 | await register_megatron_models() 28 | await restart_megatron_jobs() 29 | await clear_acked_requests_from_queue() 30 | except Exception as e: 31 | print_exception() 32 | raise e 33 | 34 | await run_megatron_tasks() 35 | 36 | yield 37 | 38 | 39 | def print_exception(): 40 | exc_type, exc_value, exc_traceback = sys.exc_info() 41 | messages = traceback.format_exception(exc_type, exc_value, exc_traceback) 42 | 43 | logger.error("".join(messages)) 44 | -------------------------------------------------------------------------------- /infra/cray_infra/generate/clear_acked_requests_from_queue.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue 2 | 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | async def clear_acked_requests_from_queue(): 8 | inference_work_queue = get_inference_work_queue() 9 | 10 | starting_size = len(inference_work_queue) 11 | 12 | inference_work_queue.clear_acked_data() 13 | 14 | ending_size = len(inference_work_queue) 15 | 16 | logger.info(f"Cleared {starting_size - ending_size} acked requests from the queue.") 17 | 18 | -------------------------------------------------------------------------------- /infra/cray_infra/one_server/create_api.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | 3 | 4 | async def create_api(port, running_status): 5 | server_config = uvicorn.Config( 6 | "cray_infra.api.fastapi.main:app", 7 | host="0.0.0.0", 8 | port=port, 9 | log_level="info", 10 | ) 11 | server = uvicorn.Server(server_config) 12 | running_status.servers.append(server) 13 | 14 | await server.serve() 15 | -------------------------------------------------------------------------------- /infra/cray_infra/one_server/create_vllm.py: 
-------------------------------------------------------------------------------- 1 | from cray_infra.util.get_config import get_config 2 | 3 | from vllm.entrypoints.openai.api_server import run_server 4 | from vllm.entrypoints.openai.cli_args import make_arg_parser 5 | from vllm.utils import FlexibleArgumentParser 6 | 7 | import torch 8 | 9 | import uvicorn 10 | import os 11 | 12 | import logging 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | async def create_vllm(port, running_status): 17 | 18 | os.environ["HUGGING_FACE_HUB_TOKEN"] = "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" 19 | 20 | config = get_config() 21 | 22 | parser = FlexibleArgumentParser( 23 | description="vLLM OpenAI-Compatible RESTful API server." 24 | ) 25 | parser = make_arg_parser(parser) 26 | args = [ 27 | f"--dtype={config['dtype']}", 28 | f"--max-model-len={config['max_model_length']}", 29 | f"--max-num-batched-tokens={config['max_model_length']}", 30 | f"--max-seq-len-to-capture={config['max_model_length']}", 31 | f"--gpu-memory-utilization={config['gpu_memory_utilization']}", 32 | f"--max-log-len={config['max_log_length']}", 33 | f"--swap-space=0", 34 | "--enable-lora", 35 | "--disable-async-output-proc", # Disable async output processing for embeddings 36 | ] 37 | 38 | if torch.cuda.is_available(): 39 | args.append("--device=cuda") 40 | 41 | args = parser.parse_args(args=args) 42 | 43 | args.port = port 44 | args.model = config["model"] 45 | 46 | logger.info(f"Running vLLM with args: {args}") 47 | 48 | await run_server(args, running_status) 49 | -------------------------------------------------------------------------------- /infra/cray_infra/one_server/start_cray_server.py: -------------------------------------------------------------------------------- 1 | from cray_infra.one_server.create_api import create_api 2 | from cray_infra.one_server.create_vllm import create_vllm 3 | 4 | import asyncio 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | async def start_cray_server(server_list: list): 11 | 12 | running_status = ServerStatus() 13 | 14 | logger.debug(f"Starting servers: {server_list}") 15 | 16 | if ("api" in server_list) or ("all" in server_list): 17 | logger.debug("Starting API server") 18 | api_task = asyncio.create_task( 19 | create_api(port=8000, running_status=running_status) 20 | ) 21 | running_status.tasks.append(api_task) 22 | 23 | if ("vllm" in server_list) or ("all" in server_list): 24 | logger.debug("Starting VLLM server") 25 | vllm_task = asyncio.create_task( 26 | create_vllm(port=8001, running_status=running_status) 27 | ) 28 | running_status.tasks.append(vllm_task) 29 | 30 | return running_status 31 | 32 | 33 | class ServerStatus: 34 | def __init__(self): 35 | self.servers = [] 36 | self.tasks = [] 37 | 38 | async def shutdown(self): 39 | for task in self.tasks: 40 | logger.debug(f"Task {task} is cancelled") 41 | task.cancel() 42 | 43 | for server in self.servers: 44 | logger.debug(f"Server {server} is cancelled") 45 | await server.shutdown() 46 | -------------------------------------------------------------------------------- /infra/cray_infra/one_server/wait_for_vllm.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.get_config import get_config 2 | 3 | import asyncio 4 | import aiohttp 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | async def wait_for_vllm(): 12 | for _ in range(30): 13 | health_status = await get_vllm_health() 14 | if health_status == 200: 15 | return 
16 | await asyncio.sleep(1) 17 | 18 | 19 | async def get_vllm_health(): 20 | config = get_config() 21 | 22 | try: 23 | async with aiohttp.ClientSession() as session: 24 | async with session.get(config["vllm_api_url"] + "/health") as response: 25 | return response.status 26 | except Exception as e: 27 | logger.error(f"Error getting health: {e}") 28 | return 500 29 | -------------------------------------------------------------------------------- /infra/cray_infra/training/get_latest_model.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.get_config import get_config 2 | 3 | import os 4 | import json 5 | 6 | 7 | def get_latest_model(): 8 | config = get_config() 9 | 10 | if not os.path.exists(config["training_job_directory"]): 11 | raise FileNotFoundError("No training jobs found") 12 | 13 | # Get the latest model by timestamp 14 | models = os.listdir(config["training_job_directory"]) 15 | 16 | if len(models) == 0: 17 | raise FileNotFoundError("No training jobs found") 18 | 19 | models.sort( 20 | key=lambda x: get_start_time(os.path.join(config["training_job_directory"], x)), 21 | reverse=True, 22 | ) 23 | 24 | model_name = models[0] 25 | 26 | return model_name 27 | 28 | 29 | def get_start_time(path): 30 | with open(os.path.join(path, "status.json")) as f: 31 | status = json.load(f) 32 | 33 | if "history" not in status: 34 | return 0 35 | 36 | return status.get("start_time", 0) 37 | -------------------------------------------------------------------------------- /infra/cray_infra/training/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gpu_aware_mpi import get_rank 3 | 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def log_gpu_memory(prefix=""): 9 | for i in range(torch.cuda.device_count()): 10 | free, total = torch.cuda.mem_get_info(i) 11 | rank = get_rank() 12 | if rank == 0: 13 | logger.debug(f"{prefix} GPU {i}: Free={free/1e6:.2f}MB, Total={total/1e6:.2f}MB") 14 | 15 | def get_model_memory_footprint(model): 16 | param_size = 0 17 | for param in model.parameters(): 18 | param_size += param.numel() * param.element_size() 19 | buffer_size = 0 20 | for buffer in model.buffers(): 21 | buffer_size += buffer.numel() * buffer.element_size() 22 | total_size = param_size + buffer_size 23 | return total_size # in bytes -------------------------------------------------------------------------------- /infra/cray_infra/training/squeue.py: -------------------------------------------------------------------------------- 1 | from cray_infra.api.fastapi.routers.request_types.squeue_response import SqueueResponse 2 | 3 | import subprocess 4 | 5 | 6 | async def squeue(): 7 | try: 8 | squeue_output = subprocess.check_output( 9 | ["squeue", '--format=%.18i %.9P %.12j %.8u %.8T %.10M %.9l %.6D %R'] 10 | ) 11 | 12 | return SqueueResponse( 13 | squeue_output=squeue_output.decode("utf-8"), 14 | ) 15 | 16 | except subprocess.CalledProcessError: 17 | return SqueueResponse( 18 | error_message="squeue command failed", 19 | ) 20 | -------------------------------------------------------------------------------- /infra/cray_infra/training/training_job_status.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class TrainingJobStatus(str, Enum): 5 | QUEUED = "QUEUED" 6 | TRAINING = "TRAINING" 7 | COMPLETED = "COMPLETED" 8 | FAILED = "FAILED" 9 | 
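The get_model_memory_footprint helper in metrics.py above simply sums parameter and buffer bytes. A minimal usage sketch, assuming an arbitrary toy torch module rather than anything from the training harness (note that importing cray_infra.training.metrics also pulls in the gpu_aware_mpi extension, which must be built first):

import torch.nn as nn

from cray_infra.training.metrics import get_model_memory_footprint

# Toy model for illustration only: two float32 Linear layers and no buffers.
model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))

# float32 parameters are 4 bytes each:
# (128*256 + 256) + (256*10 + 10) = 35,594 parameters -> ~142 KB.
print(f"model footprint: {get_model_memory_footprint(model) / 1e3:.1f} KB")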
-------------------------------------------------------------------------------- /infra/cray_infra/training/vllm_model_manager.py: -------------------------------------------------------------------------------- 1 | class VLLMModelManager: 2 | def __init__(self): 3 | self._models = [] 4 | 5 | def set_registered_models(self, models): 6 | self._models = models 7 | 8 | def get_registered_models(self): 9 | return self._models 10 | 11 | def find_model(self, model_name): 12 | for model in self._models: 13 | if model_name in model: 14 | return model 15 | return None 16 | 17 | 18 | def get_vllm_model_manager(): 19 | """ 20 | Returns a singleton instance of VLLMModelManager. 21 | """ 22 | if not hasattr(get_vllm_model_manager, "_instance"): 23 | get_vllm_model_manager._instance = VLLMModelManager() 24 | return get_vllm_model_manager._instance 25 | -------------------------------------------------------------------------------- /infra/cray_infra/util/default_config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Config(BaseModel): 5 | api_url: str = "http://localhost:8000" 6 | 7 | model: str = "meta-llama/llama-3.1-8b-instruct" 8 | 9 | # 10GB using 1024 for KB, 1024 for MB, 1024 for GB 10 | max_upload_file_size: int = 1024 * 1024 * 1024 * 10 11 | 12 | train_job_entrypoint: str = "/app/cray/scripts/train_job_entrypoint.sh" 13 | training_job_directory: str = "/app/cray/jobs" 14 | 15 | max_train_time: int = 15 * 60 16 | extra_training_seconds: int = 300 # 5 minutes buffer before slurm kills the job 17 | 18 | slurm_wait_time: int = 30 # seconds 19 | 20 | megatron_refresh_period: int = 30 # seconds 21 | 22 | vllm_api_url: str = "http://localhost:8001" 23 | 24 | generate_batch_size: int = 1024 25 | 26 | response_timeout: int = 60 # seconds 27 | inference_work_queue_timeout: int = 30 # seconds 28 | 29 | inference_work_queue_path: str = "/app/cray/inference_work_queue.sqlite" 30 | 31 | gpu_memory_utilization: float = 0.50 32 | max_model_length: int = 8192 33 | dtype: str = "bfloat16" 34 | 35 | max_log_length: int = 100 36 | 37 | server_list: str = "all" 38 | 39 | tokenformer_r: int = 32 40 | tokenformer_num_heads: int = 4 41 | 42 | tokenformer_cache_capacity: int = 2 43 | 44 | -------------------------------------------------------------------------------- /infra/cray_infra/util/default_job_config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from typing import Optional 4 | 5 | 6 | class LoraConfig(BaseModel): 7 | r: int = 32 8 | target_modules: str = "all-linear" 9 | use_rslora: bool = True 10 | modules_to_save: list = ["lm_head"] 11 | 12 | 13 | class DiffusionForcingModelConfig(BaseModel): 14 | num_hidden_layers: int = 2 15 | num_diffusion_iterations: int = 3 16 | diffusion_step_size: int = 2 17 | hidden_size: int = 128 18 | num_attention_heads: int = 4 19 | attention_dropout: float = 0.1 20 | 21 | 22 | class JobConfig(BaseModel): 23 | 24 | job_directory: str 25 | training_data_path: str 26 | dataset_hash: str 27 | 28 | #llm_name: str = "masint/tiny-random-llama" 29 | llm_name: str = "meta-llama/Llama-3.2-1B-Instruct" 30 | 31 | # Training 32 | max_steps: int = 100 33 | learning_rate: float = 3e-3 34 | batch_size: int = 1 35 | gradient_clip_value: float = 1.0 36 | 37 | max_token_block_size: int = 16777216 # 16 mega tokens 38 | 39 | # Checkpointing 40 | steps_per_checkpoint: int = 100 41 | max_checkpoints_to_keep: int = 3 42 | 43 | gpus: 
int = 1 44 | nodes: int = 1 45 | 46 | lora_config: Optional[LoraConfig] = LoraConfig() 47 | diffusion_forcing_config: Optional[DiffusionForcingModelConfig] = ( 48 | DiffusionForcingModelConfig() 49 | ) 50 | 51 | # 4 hours in seconds 52 | timeout: int = 4 * 60 * 60 53 | 54 | training_history_length: int = 1024 55 | 56 | -------------------------------------------------------------------------------- /infra/cray_infra/util/get_config.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.default_config import Config 2 | 3 | import os 4 | import yaml 5 | 6 | 7 | def get_config(): 8 | loaded_config = {} 9 | 10 | config_path = "/app/cray/cray-config.yaml" 11 | 12 | if os.path.exists(config_path): 13 | with open(config_path, "r") as stream: 14 | loaded_config = yaml.safe_load(stream) 15 | 16 | return Config(**loaded_config).dict() 17 | -------------------------------------------------------------------------------- /infra/cray_infra/util/get_job_config.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.default_job_config import JobConfig 2 | 3 | import yaml 4 | import os 5 | 6 | 7 | def get_job_config(): 8 | job_config_path = get_job_config_path() 9 | 10 | with open(job_config_path, "r") as stream: 11 | job_config = yaml.safe_load(stream) 12 | 13 | # fill in missing values with defaults 14 | job_config = JobConfig(**job_config).dict() 15 | 16 | return job_config 17 | 18 | 19 | def get_job_config_path(): 20 | assert ( 21 | "CRAY_TRAINING_JOB_CONFIG_PATH" in os.environ 22 | ), "CRAY_TRAINING_JOB_CONFIG_PATH not set" 23 | return os.environ["CRAY_TRAINING_JOB_CONFIG_PATH"] 24 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.entrypoints.llm import LLM 7 | from vllm.executor.ray_utils import initialize_ray_cluster 8 | from vllm.inputs import PromptType, TextPrompt, TokensPrompt 9 | from vllm.model_executor.models import ModelRegistry 10 | from vllm.outputs import ( 11 | CompletionOutput, 12 | EmbeddingOutput, 13 | EmbeddingRequestOutput, 14 | RequestOutput, 15 | ) 16 | from vllm.pooling_params import PoolingParams 17 | from vllm.sampling_params import SamplingParams 18 | 19 | from .version import __version__, __version_tuple__ 20 | 21 | __all__ = [ 22 | "__version__", 23 | "__version_tuple__", 24 | "LLM", 25 | "ModelRegistry", 26 | "PromptType", 27 | "TextPrompt", 28 | "TokensPrompt", 29 | "SamplingParams", 30 | "RequestOutput", 31 | "CompletionOutput", 32 | "EmbeddingOutput", 33 | "EmbeddingRequestOutput", 34 | "LLMEngine", 35 | "EngineArgs", 36 | "AsyncLLMEngine", 37 | "AsyncEngineArgs", 38 | "initialize_ray_cluster", 39 | "PoolingParams", 40 | ] 41 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/_version.py: -------------------------------------------------------------------------------- 1 | # file generated by setuptools_scm 2 | # don't change, don't track in version control 3 | TYPE_CHECKING = False 4 | if TYPE_CHECKING: 5 | from typing import Tuple, Union 6 | 7 | VERSION_TUPLE = Tuple[Union[int, str], ...] 
8 | else: 9 | VERSION_TUPLE = object 10 | 11 | version: str 12 | __version__: str 13 | __version_tuple__: VERSION_TUPLE 14 | version_tuple: VERSION_TUPLE 15 | 16 | __version__ = version = "0.1.dev5+g815064c.d20241108" 17 | __version_tuple__ = version_tuple = (0, 1, "dev5", "g815064c.d20241108") 18 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Tuple 3 | 4 | 5 | @dataclass 6 | class AdapterMapping: 7 | # Per every token in input_ids: 8 | index_mapping: Tuple[int, ...] 9 | # Per sampled token: 10 | prompt_mapping: Tuple[int, ...] 11 | 12 | def __post_init__(self): 13 | self.index_mapping = tuple(self.index_mapping) 14 | self.prompt_mapping = tuple(self.prompt_mapping) 15 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AdapterRequest(ABC): 5 | """ 6 | Base class for adapter requests. 7 | """ 8 | 9 | @property 10 | @abstractmethod 11 | def adapter_id(self) -> int: 12 | raise NotImplementedError 13 | 14 | def __post_init__(self) -> None: 15 | if self.adapter_id < 1: 16 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 17 | 18 | def __eq__(self, value: object) -> bool: 19 | return isinstance(value, self.__class__) and self.adapter_id == value.adapter_id 20 | 21 | def __hash__(self) -> int: 22 | return hash(self.adapter_id) 23 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/adapter_commons/worker_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Set 3 | 4 | import torch 5 | 6 | 7 | class AbstractWorkerManager(ABC): 8 | 9 | def __init__(self, device: torch.device): 10 | self.device = device 11 | 12 | @property 13 | @abstractmethod 14 | def is_enabled(self) -> bool: 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def add_adapter(self, adapter_request: Any) -> bool: 23 | raise NotImplementedError 24 | 25 | @abstractmethod 26 | def remove_adapter(self, adapter_id: int) -> bool: 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def remove_all_adapters(self) -> None: 31 | raise NotImplementedError 32 | 33 | @abstractmethod 34 | def list_adapters(self) -> Set[int]: 35 | raise NotImplementedError 36 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/assets/__init__.py 
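The adapter_commons module above pins down the shared interface that concrete adapter types (the LoRA and prompt adapters elsewhere in this tree) implement: AdapterMapping for per-token index mappings, AdapterRequest for identity and validation, and AbstractWorkerManager for adapter lifecycle on a worker. A minimal sketch of a concrete request, using a hypothetical ToyAdapterRequest that is not part of the repository:

from dataclasses import dataclass

from vllm.adapter_commons.request import AdapterRequest


# eq=False keeps AdapterRequest's __eq__/__hash__, which compare by adapter_id only.
@dataclass(eq=False)
class ToyAdapterRequest(AdapterRequest):
    toy_id: int
    toy_path: str

    @property
    def adapter_id(self) -> int:
        return self.toy_id


req = ToyAdapterRequest(toy_id=1, toy_path="/tmp/toy-adapter")

# The base class validates adapter_id > 0 in __post_init__ and keys equality on it,
# so two requests with the same id but different paths compare equal.
assert req == ToyAdapterRequest(toy_id=1, toy_path="/somewhere/else")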
-------------------------------------------------------------------------------- /infra/cray_infra/vllm/assets/audio.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal, Tuple 3 | from urllib.parse import urljoin 4 | 5 | import librosa 6 | import numpy as np 7 | 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL 9 | 10 | ASSET_DIR = "multimodal_asset" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AudioAsset: 15 | name: Literal["winning_call", "mary_had_lamb"] 16 | 17 | @property 18 | def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: 19 | 20 | audio_path = get_vllm_public_assets( 21 | filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR 22 | ) 23 | y, sr = librosa.load(audio_path, sr=None) 24 | assert isinstance(sr, int) 25 | return y, sr 26 | 27 | @property 28 | def url(self) -> str: 29 | return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") 30 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/assets/base.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import vllm.envs as envs 6 | from vllm.connections import global_http_connection 7 | from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT 8 | 9 | vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" 10 | 11 | 12 | def get_cache_dir() -> Path: 13 | """Get the path to the cache for storing downloaded assets.""" 14 | path = Path(envs.VLLM_ASSETS_CACHE) 15 | path.mkdir(parents=True, exist_ok=True) 16 | 17 | return path 18 | 19 | 20 | @lru_cache 21 | def get_vllm_public_assets(filename: str, s3_prefix: Optional[str] = None) -> Path: 22 | """ 23 | Download an asset file from ``s3://vllm-public-assets`` 24 | and return the path to the downloaded file. 25 | """ 26 | asset_directory = get_cache_dir() / "vllm_public_assets" 27 | asset_directory.mkdir(parents=True, exist_ok=True) 28 | 29 | asset_path = asset_directory / filename 30 | if not asset_path.exists(): 31 | if s3_prefix is not None: 32 | filename = s3_prefix + "/" + filename 33 | global_http_connection.download_file( 34 | f"{vLLM_S3_BUCKET_URL}/{filename}", 35 | asset_path, 36 | timeout=VLLM_IMAGE_FETCH_TIMEOUT, 37 | ) 38 | 39 | return asset_path 40 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/assets/image.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from PIL import Image 6 | 7 | from vllm.assets.base import get_vllm_public_assets 8 | 9 | VLM_IMAGES_DIR = "vision_model_images" 10 | 11 | 12 | @dataclass(frozen=True) 13 | class ImageAsset: 14 | name: Literal["stop_sign", "cherry_blossom"] 15 | 16 | @property 17 | def pil_image(self) -> Image.Image: 18 | 19 | image_path = get_vllm_public_assets( 20 | filename=f"{self.name}.jpg", s3_prefix=VLM_IMAGES_DIR 21 | ) 22 | return Image.open(image_path) 23 | 24 | @property 25 | def image_embeds(self) -> torch.Tensor: 26 | """ 27 | Image embeddings, only used for testing purposes with llava 1.5. 
28 | """ 29 | image_path = get_vllm_public_assets( 30 | filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR 31 | ) 32 | return torch.load(image_path) 33 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import ( 2 | AttentionBackend, 3 | AttentionMetadata, 4 | AttentionMetadataBuilder, 5 | AttentionState, 6 | AttentionType, 7 | ) 8 | from vllm.attention.layer import Attention 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | __all__ = [ 12 | "Attention", 13 | "AttentionBackend", 14 | "AttentionMetadata", 15 | "AttentionType", 16 | "AttentionMetadataBuilder", 17 | "Attention", 18 | "AttentionState", 19 | "get_attn_backend", 20 | ] 21 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/compilation/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/distributed/communication_op.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from .parallel_state import get_tp_group 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: 10 | """All-reduce the input tensor across model parallel group.""" 11 | return get_tp_group().all_reduce(input_) 12 | 13 | 14 | def tensor_model_parallel_all_gather( 15 | input_: torch.Tensor, dim: int = -1 16 | ) -> torch.Tensor: 17 | """All-gather the input tensor across model parallel group.""" 18 | return get_tp_group().all_gather(input_, dim) 19 | 20 | 21 | def tensor_model_parallel_gather( 22 | input_: torch.Tensor, dst: int = 0, dim: int = -1 23 | ) -> Optional[torch.Tensor]: 24 | """Gather the input tensor across model parallel group.""" 25 | return get_tp_group().gather(input_, dst, dim) 26 | 27 | 28 | def broadcast_tensor_dict( 29 | tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0 30 | ): 31 | if not torch.distributed.is_initialized(): 32 | return tensor_dict 33 | return get_tp_group().broadcast_tensor_dict(tensor_dict, src) 34 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Sequence as GenericSequence 3 | from typing import Union 4 | 5 | from vllm.model_executor.layers.sampler import SamplerOutput 6 | from vllm.sequence import PoolerOutput, SequenceGroupOutput 7 | 8 | 9 | def create_output_by_sequence_group( 10 | outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], num_seq_groups: int 11 | ) -> List[List[SequenceGroupOutput]]: 12 | """Helper method which transforms a 2d list organized by 13 | [step][sequence group] into [sequence group][step]. 
14 | """ 15 | output_by_sequence_group: List[List[SequenceGroupOutput]] = [ 16 | [] for _ in range(num_seq_groups) 17 | ] 18 | for step in outputs: 19 | for i, sequence_group_output in enumerate(step): 20 | output_by_sequence_group[i].append(sequence_group_output) 21 | 22 | return output_by_sequence_group 23 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_tool_parser import ToolParser, ToolParserManager 2 | from .hermes_tool_parser import Hermes2ProToolParser 3 | from .internlm2_tool_parser import Internlm2ToolParser 4 | from .llama_tool_parser import Llama3JsonToolParser 5 | from .mistral_tool_parser import MistralToolParser 6 | 7 | __all__ = [ 8 | "ToolParser", 9 | "ToolParserManager", 10 | "Hermes2ProToolParser", 11 | "MistralToolParser", 12 | "Internlm2ToolParser", 13 | "Llama3JsonToolParser", 14 | ] 15 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/executor/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/executor/msgspec_utils.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Any, Type 3 | 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE 5 | 6 | 7 | def encode_hook(obj: Any) -> Any: 8 | """Custom msgspec enc hook that supports array types. 9 | 10 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 11 | """ 12 | if isinstance(obj, array): 13 | assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( 14 | f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " 15 | f"Given array has a type code of {obj.typecode}." 16 | ) 17 | return obj.tobytes() 18 | 19 | 20 | def decode_hook(type: Type, obj: Any) -> Any: 21 | """Custom msgspec dec hook that supports array types. 
22 | 23 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 24 | """ 25 | if type is array: 26 | deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) 27 | deserialized.frombytes(obj) 28 | return deserialized 29 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/executor/multiproc_xpu_executor.py: -------------------------------------------------------------------------------- 1 | import vllm.envs as envs 2 | from vllm.executor.multiproc_gpu_executor import ( 3 | MultiprocessingGPUExecutor, 4 | MultiprocessingGPUExecutorAsync, 5 | ) 6 | from vllm.executor.xpu_executor import XPUExecutor 7 | from vllm.logger import init_logger 8 | from vllm.utils import make_async 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor): 14 | """Python multiprocessing-based multi-XPU executor""" 15 | 16 | def _check_executor_parameters(self): 17 | mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD 18 | if mp_method != "spawn": 19 | raise RuntimeError( 20 | "XPU multiprocess executor only support spawn as mp method" 21 | ) 22 | 23 | 24 | class MultiprocessingXPUExecutorAsync( 25 | MultiprocessingXPUExecutor, MultiprocessingGPUExecutorAsync 26 | ): 27 | 28 | def __init__(self, *args, **kwargs): 29 | super().__init__(*args, **kwargs) 30 | self.driver_exec_model = make_async(self.driver_worker.execute_model) 31 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/executor/ray_xpu_executor.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional 3 | 4 | import vllm.envs as envs 5 | from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync 6 | from vllm.executor.xpu_executor import XPUExecutor 7 | from vllm.logger import init_logger 8 | from vllm.utils import get_vllm_instance_id, make_async 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | class RayXPUExecutor(RayGPUExecutor, XPUExecutor): 14 | 15 | def _get_env_vars_to_be_updated(self): 16 | # Get the set of GPU IDs used on each node. 17 | worker_node_and_gpu_ids = self._run_workers( 18 | "get_node_and_gpu_ids", use_dummy_driver=True 19 | ) 20 | 21 | VLLM_INSTANCE_ID = get_vllm_instance_id() 22 | 23 | # Set environment variables for the driver and workers. 
24 | all_args_to_update_environment_variables = [ 25 | ( 26 | { 27 | "VLLM_INSTANCE_ID": VLLM_INSTANCE_ID, 28 | "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), 29 | }, 30 | ) 31 | for (_, _) in worker_node_and_gpu_ids 32 | ] 33 | return all_args_to_update_environment_variables 34 | 35 | 36 | class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync): 37 | 38 | def __init__(self, *args, **kwargs): 39 | super().__init__(*args, **kwargs) 40 | self.driver_exec_method = make_async(self.driver_worker.execute_method) 41 | self.pp_locks: Optional[List[asyncio.Lock]] = None 42 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/forward_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any 3 | 4 | _forward_context: Any = None 5 | 6 | 7 | def get_forward_context() -> Any: 8 | """Get the current forward context.""" 9 | return _forward_context 10 | 11 | 12 | @contextmanager 13 | def set_forward_context(context: Any): 14 | """A context manager that stores the current forward context, 15 | can be attention metadata, etc.""" 16 | global _forward_context 17 | prev_context = _forward_context 18 | _forward_context = context 19 | try: 20 | yield 21 | finally: 22 | _forward_context = prev_context 23 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/inputs/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import ( 2 | EncoderDecoderLLMInputs, 3 | ExplicitEncoderDecoderPrompt, 4 | LLMInputs, 5 | PromptType, 6 | SingletonPrompt, 7 | TextPrompt, 8 | TokensPrompt, 9 | build_explicit_enc_dec_prompt, 10 | to_enc_dec_tuple_list, 11 | zip_enc_dec_prompts, 12 | ) 13 | from .registry import InputContext, InputRegistry 14 | 15 | INPUT_REGISTRY = InputRegistry() 16 | """ 17 | The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine` 18 | to dispatch data processing according to the target model. 19 | 20 | See also: 21 | :ref:`input_processing_pipeline` 22 | """ 23 | 24 | __all__ = [ 25 | "TextPrompt", 26 | "TokensPrompt", 27 | "PromptType", 28 | "SingletonPrompt", 29 | "ExplicitEncoderDecoderPrompt", 30 | "LLMInputs", 31 | "EncoderDecoderLLMInputs", 32 | "build_explicit_enc_dec_prompt", 33 | "to_enc_dec_tuple_list", 34 | "zip_enc_dec_prompts", 35 | "INPUT_REGISTRY", 36 | "InputContext", 37 | "InputRegistry", 38 | ] 39 | 40 | 41 | def __getattr__(name: str): 42 | if name == "PromptInput": 43 | import warnings 44 | 45 | msg = ( 46 | "PromptInput has been renamed to PromptType. " 47 | "The original name will be removed in an upcoming version." 
48 | ) 49 | 50 | warnings.warn(DeprecationWarning(msg), stacklevel=2) 51 | 52 | return PromptType 53 | 54 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 55 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/logging/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/logging/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\r\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter 2 | from vllm.model_executor.sampling_metadata import ( 3 | SamplingMetadata, 4 | SamplingMetadataCache, 5 | ) 6 | from vllm.model_executor.utils import set_random_seed 7 | 8 | __all__ = [ 9 | "SamplingMetadata", 10 | "SamplingMetadataCache", 11 | "set_random_seed", 12 | "BasevLLMParameter", 13 | "PackedvLLMParameter", 14 | ] 15 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.layer import ( 2 | FusedMoE, 3 | FusedMoEMethodBase, 4 | FusedMoeWeightScaleSupported, 5 | ) 6 | from vllm.triton_utils import HAS_TRITON 7 | 8 | __all__ = [ 9 | "FusedMoE", 10 | "FusedMoEMethodBase", 11 | "FusedMoeWeightScaleSupported", 12 | ] 13 | 14 | if HAS_TRITON: 15 | from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( 16 | fused_marlin_moe, 17 | 
single_marlin_moe, 18 | ) 19 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 20 | fused_experts, 21 | fused_moe, 22 | fused_topk, 23 | get_config_file_name, 24 | grouped_topk, 25 | ) 26 | 27 | __all__ += [ 28 | "fused_marlin_moe", 29 | "single_marlin_moe", 30 | "fused_moe", 31 | "fused_topk", 32 | "fused_experts", 33 | "get_config_file_name", 34 | "grouped_topk", 35 | ] 36 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .compressed_tensors_scheme import CompressedTensorsScheme 2 | from .compressed_tensors_w4a16_24 import ( 3 | W4A16SPARSE24_SUPPORTED_BITS, 4 | CompressedTensorsW4A16Sparse24, 5 | ) 6 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 7 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 8 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 9 | from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16 10 | 11 | __all__ = [ 12 | "CompressedTensorsScheme", 13 | "CompressedTensorsWNA16", 14 | "CompressedTensorsW8A16Fp8", 15 | "CompressedTensorsW4A16Sparse24", 16 | "CompressedTensorsW8A8Int8", 17 | "CompressedTensorsW8A8Fp8", 18 | "WNA16_SUPPORTED_BITS", 19 | "W4A16SPARSE24_SUPPORTED_BITS", 20 | ] 21 | -------------------------------------------------------------------------------- 
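The fused_moe configs README earlier in this listing describes JSON files that, for a given E (number of experts), N (intermediate size), and device_name, map the batch size M to a chosen kernel configuration. A rough illustration of that mapping; the tiling keys below are assumptions (typical Triton kernel knobs), not values taken from this repository:

import json

# Illustrative only: each top-level key is a batch size M, each value is the
# kernel configuration the README says was tuned for that M.
example_config = {
    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64,
          "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64,
           "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 4},
}

print(json.dumps(example_config, indent=2))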
/infra/cray_infra/vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_utils import replace_parameter, update_tensor_inplace 2 | 3 | __all__ = ["update_tensor_inplace", "replace_parameter"] 4 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/layers/quantization/utils/machete_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | 5 | from vllm.scalar_type import ScalarType, scalar_types 6 | 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128] 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] 9 | 10 | 11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: 12 | if zero_points: 13 | return [scalar_types.uint4, scalar_types.uint8] 14 | else: 15 | return [scalar_types.uint4b8, scalar_types.uint8b128] 16 | 17 | 18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: 19 | return [torch.float16, torch.bfloat16] 20 | 21 | 22 | def check_machete_supports_shape( 23 | in_features: int, out_featrues: int 24 | ) -> Tuple[bool, Optional[str]]: 25 | if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: 26 | return ( 27 | False, 28 | "Input features size must be divisible by " 29 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}", 30 | ) 31 | if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: 32 | return ( 33 | False, 34 | "Output features size must be divisible by " 35 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}", 36 | ) 37 | return True, None 38 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import nn 4 | 5 | from vllm.config import ( 6 | CacheConfig, 7 | DeviceConfig, 8 | LoadConfig, 9 | LoRAConfig, 10 | ModelConfig, 11 | ParallelConfig, 12 | SchedulerConfig, 13 | ) 14 | from vllm.model_executor.model_loader.loader import BaseModelLoader, get_model_loader 15 | from vllm.model_executor.model_loader.utils import ( 16 | get_architecture_class_name, 17 | get_model_architecture, 18 | ) 19 | 20 | 21 | def get_model( 22 | *, 23 | model_config: ModelConfig, 24 | load_config: LoadConfig, 25 | device_config: DeviceConfig, 26 | parallel_config: ParallelConfig, 27 | scheduler_config: SchedulerConfig, 28 | lora_config: Optional[LoRAConfig], 29 | cache_config: CacheConfig 30 | ) -> nn.Module: 31 | loader = get_model_loader(load_config) 32 | return loader.load_model( 33 | model_config=model_config, 34 | device_config=device_config, 35 | lora_config=lora_config, 36 | parallel_config=parallel_config, 37 | scheduler_config=scheduler_config, 38 | cache_config=cache_config, 39 | ) 40 | 41 | 42 | __all__ = [ 43 | "get_model", 44 | "get_model_loader", 45 | "BaseModelLoader", 46 | "get_architecture_class_name", 47 | "get_model_architecture", 48 | ] 49 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/model_loader/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | 3 | import contextlib 4 | from typing import Tuple, Type 5 | 6 | import torch 7 | from torch import nn 8 | 9 | from vllm.config import ModelConfig 10 | from 
vllm.model_executor.models import ModelRegistry 11 | 12 | 13 | @contextlib.contextmanager 14 | def set_default_torch_dtype(dtype: torch.dtype): 15 | """Sets the default torch dtype to the given dtype.""" 16 | old_dtype = torch.get_default_dtype() 17 | torch.set_default_dtype(dtype) 18 | yield 19 | torch.set_default_dtype(old_dtype) 20 | 21 | 22 | def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: 23 | architectures = getattr(model_config.hf_config, "architectures", []) 24 | # Special handling for quantized Mixtral. 25 | # FIXME(woosuk): This is a temporary hack. 26 | mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"] 27 | 28 | if ( 29 | model_config.quantization is not None 30 | and model_config.quantization not in mixtral_supported 31 | and "MixtralForCausalLM" in architectures 32 | ): 33 | architectures = ["QuantMixtralForCausalLM"] 34 | 35 | return ModelRegistry.resolve_model_cls(architectures) 36 | 37 | 38 | def get_architecture_class_name(model_config: ModelConfig) -> str: 39 | return get_model_architecture(model_config)[1] 40 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .interfaces import ( 2 | HasInnerState, 3 | SupportsLoRA, 4 | SupportsMultiModal, 5 | SupportsPP, 6 | has_inner_state, 7 | supports_lora, 8 | supports_multimodal, 9 | supports_pp, 10 | ) 11 | from .interfaces_base import ( 12 | VllmModelForEmbedding, 13 | VllmModelForTextGeneration, 14 | is_embedding_model, 15 | is_text_generation_model, 16 | ) 17 | from .registry import ModelRegistry 18 | 19 | __all__ = [ 20 | "ModelRegistry", 21 | "VllmModelForEmbedding", 22 | "is_embedding_model", 23 | "VllmModelForTextGeneration", 24 | "is_text_generation_model", 25 | "HasInnerState", 26 | "has_inner_state", 27 | "SupportsLoRA", 28 | "supports_lora", 29 | "SupportsMultiModal", 30 | "supports_multimodal", 31 | "SupportsPP", 32 | "supports_pp", 33 | ] 34 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from llama.py 3 | """Inference-only Phi3 model code inherit from Llama.py""" 4 | 5 | from vllm.model_executor.models.llama import LlamaForCausalLM 6 | 7 | 8 | class Phi3ForCausalLM(LlamaForCausalLM): 9 | 10 | packed_modules_mapping = { 11 | "qkv_proj": [ 12 | "qkv_proj", 13 | ], 14 | "gate_up_proj": [ 15 | "gate_up_proj", 16 | ], 17 | } 18 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | 3 | from typing import Any, Dict, Optional 4 | 5 | import torch 6 | 7 | from vllm.utils import seed_everything 8 | 9 | 10 | def set_random_seed(seed: int) -> None: 11 | seed_everything(seed) 12 | 13 | 14 | def set_weight_attrs( 15 | weight: torch.Tensor, 16 | weight_attrs: Optional[Dict[str, Any]], 17 | ): 18 | """Set attributes on a weight tensor. 19 | 20 | This method is used to set attributes on a weight tensor. This method 21 | will not overwrite existing attributes. 22 | 23 | Args: 24 | weight: The weight tensor. 25 | weight_attrs: A dictionary of attributes to set on the weight tensor. 
26 | """ 27 | if weight_attrs is None: 28 | return 29 | for key, value in weight_attrs.items(): 30 | assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}" 31 | setattr(weight, key, value) 32 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ( 2 | BatchedTensorInputs, 3 | MultiModalDataBuiltins, 4 | MultiModalDataDict, 5 | MultiModalInputs, 6 | MultiModalPlugin, 7 | NestedTensors, 8 | ) 9 | from .registry import MultiModalRegistry 10 | 11 | MULTIMODAL_REGISTRY = MultiModalRegistry() 12 | """ 13 | The global :class:`~MultiModalRegistry` is used by model runners to 14 | dispatch data processing according to its modality and the target model. 15 | 16 | See also: 17 | :ref:`input_processing_pipeline` 18 | """ 19 | 20 | __all__ = [ 21 | "BatchedTensorInputs", 22 | "MultiModalDataBuiltins", 23 | "MultiModalDataDict", 24 | "MultiModalInputs", 25 | "MultiModalPlugin", 26 | "NestedTensors", 27 | "MULTIMODAL_REGISTRY", 28 | "MultiModalRegistry", 29 | ] 30 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/multimodal/audio.py: -------------------------------------------------------------------------------- 1 | from vllm.inputs.registry import InputContext 2 | from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin 3 | 4 | 5 | class AudioPlugin(MultiModalPlugin): 6 | """Plugin for audio data.""" 7 | 8 | def get_data_key(self) -> str: 9 | return "audio" 10 | 11 | def _default_input_mapper( 12 | self, ctx: InputContext, data: object, **mm_processor_kwargs 13 | ) -> MultiModalInputs: 14 | raise NotImplementedError("There is no default audio input mapper") 15 | 16 | def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: 17 | raise NotImplementedError("There is no default maximum multimodal tokens") 18 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/platforms/cpu.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import torch 3 | 4 | from .interface import Platform, PlatformEnum 5 | 6 | 7 | class CpuPlatform(Platform): 8 | _enum = PlatformEnum.CPU 9 | 10 | @classmethod 11 | def get_device_name(cls, device_id: int = 0) -> str: 12 | return "cpu" 13 | 14 | @classmethod 15 | def get_device_total_memory(cls, device_id: int = 0) -> int: 16 | return psutil.virtual_memory().total 17 | 18 | @classmethod 19 | def inference_mode(cls): 20 | return torch.no_grad() 21 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/platforms/rocm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import lru_cache 3 | 4 | import torch 5 | 6 | from vllm.logger import init_logger 7 | 8 | from .interface import DeviceCapability, Platform, PlatformEnum 9 | 10 | logger = init_logger(__name__) 11 | 12 | if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: 13 | logger.warning( 14 | "`fork` method is not supported by ROCm. " 15 | "VLLM_WORKER_MULTIPROC_METHOD is overridden to" 16 | " `spawn` instead." 
17 | ) 18 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 19 | 20 | 21 | class RocmPlatform(Platform): 22 | _enum = PlatformEnum.ROCM 23 | 24 | @classmethod 25 | @lru_cache(maxsize=8) 26 | def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: 27 | major, minor = torch.cuda.get_device_capability(device_id) 28 | return DeviceCapability(major=major, minor=minor) 29 | 30 | @classmethod 31 | @lru_cache(maxsize=8) 32 | def get_device_name(cls, device_id: int = 0) -> str: 33 | return torch.cuda.get_device_name(device_id) 34 | 35 | @classmethod 36 | def get_device_total_memory(cls, device_id: int = 0) -> int: 37 | device_props = torch.cuda.get_device_properties(device_id) 38 | return device_props.total_memory 39 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/platforms/tpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .interface import Platform, PlatformEnum 4 | 5 | 6 | class TpuPlatform(Platform): 7 | _enum = PlatformEnum.TPU 8 | 9 | @classmethod 10 | def get_device_name(cls, device_id: int = 0) -> str: 11 | raise NotImplementedError 12 | 13 | @classmethod 14 | def get_device_total_memory(cls, device_id: int = 0) -> int: 15 | raise NotImplementedError 16 | 17 | @classmethod 18 | def inference_mode(cls): 19 | return torch.no_grad() 20 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/platforms/xpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .interface import DeviceCapability, Platform, PlatformEnum 4 | 5 | 6 | class XPUPlatform(Platform): 7 | _enum = PlatformEnum.XPU 8 | 9 | @staticmethod 10 | def get_device_capability(device_id: int = 0) -> DeviceCapability: 11 | major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split( 12 | "." 13 | ) 14 | return DeviceCapability(major=int(major), minor=int(minor)) 15 | 16 | @staticmethod 17 | def get_device_name(device_id: int = 0) -> str: 18 | return torch.xpu.get_device_name(device_id) 19 | 20 | @classmethod 21 | def get_device_total_memory(cls, device_id: int = 0) -> int: 22 | device_props = torch.xpu.get_device_properties(device_id) 23 | return device_props.total_memory 24 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable, Optional, Union 3 | 4 | import vllm.envs as envs 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def load_general_plugins(): 10 | """WARNING: plugins can be loaded for multiple times in different 11 | processes. They should be designed in a way that they can be loaded 12 | multiple times without causing issues. 
13 | """ 14 | import sys 15 | 16 | if sys.version_info < (3, 10): 17 | from importlib_metadata import entry_points 18 | else: 19 | from importlib.metadata import entry_points 20 | 21 | allowed_plugins = envs.VLLM_PLUGINS 22 | 23 | discovered_plugins = entry_points(group="vllm.general_plugins") 24 | for plugin in discovered_plugins: 25 | logger.info("Found general plugin: %s", plugin.name) 26 | if allowed_plugins is None or plugin.name in allowed_plugins: 27 | try: 28 | func = plugin.load() 29 | func() 30 | logger.info("Loaded general plugin: %s", plugin.name) 31 | except Exception: 32 | logger.exception("Failed to load general plugin: %s", plugin.name) 33 | 34 | 35 | _torch_compile_backend: Optional[Union[Callable, str]] = None 36 | 37 | 38 | def set_torch_compile_backend(backend: Union[Callable, str]): 39 | global _torch_compile_backend 40 | _torch_compile_backend = backend 41 | 42 | 43 | def get_torch_compile_backend() -> Optional[Union[Callable, str]]: 44 | return _torch_compile_backend 45 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/pooling_params.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import msgspec 4 | 5 | 6 | class PoolingParams( 7 | msgspec.Struct, omit_defaults=True, array_like=True # type: ignore[call-arg] 8 | ): # type: ignore[call-arg] 9 | """Pooling parameters for pooling. 10 | 11 | Attributes: 12 | additional_data: Any additional data needed for pooling. 13 | """ 14 | 15 | additional_data: Optional[Any] = None 16 | 17 | def clone(self) -> "PoolingParams": 18 | """Returns a deep copy of the PoolingParams instance.""" 19 | return PoolingParams( 20 | additional_data=self.additional_data, 21 | ) 22 | 23 | def __repr__(self) -> str: 24 | return f"PoolingParams(" f"additional_metadata={self.additional_data})" 25 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/prompt_adapter/request.py: -------------------------------------------------------------------------------- 1 | import msgspec 2 | 3 | from vllm.adapter_commons.request import AdapterRequest 4 | 5 | 6 | class PromptAdapterRequest( 7 | msgspec.Struct, 8 | array_like=True, # type: ignore[call-arg] 9 | omit_defaults=True, # type: ignore[call-arg] 10 | frozen=True, 11 | ): # type: ignore[call-arg] 12 | """ 13 | Request for a Prompt adapter. 14 | """ 15 | 16 | __metaclass__ = AdapterRequest 17 | 18 | prompt_adapter_name: str 19 | prompt_adapter_id: int 20 | prompt_adapter_local_path: str 21 | prompt_adapter_num_virtual_tokens: int 22 | 23 | def __hash__(self): 24 | return super().__hash__() 25 | 26 | @property 27 | def adapter_id(self): 28 | return self.prompt_adapter_id 29 | 30 | @property 31 | def name(self): 32 | return self.prompt_adapter_name 33 | 34 | @property 35 | def local_path(self): 36 | return self.prompt_adapter_local_path 37 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 
2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/scalar_type.py: -------------------------------------------------------------------------------- 1 | from ._core_ext import NanRepr, ScalarType 2 | 3 | # naming generally follows: https://github.com/jax-ml/ml_dtypes 4 | # for floating point types (leading f) the scheme is: 5 | # `float_em[flags]` 6 | # flags: 7 | # - no-flags: means it follows IEEE 754 conventions 8 | # - f: means finite values only (no infinities) 9 | # - n: means nans are supported (non-standard encoding) 10 | # for integer types the scheme is: 11 | # `[u]int[b]` 12 | # - if bias is not present it means its zero 13 | 14 | 15 | class scalar_types: 16 | int4 = ScalarType.int_(4, None) 17 | uint4 = ScalarType.uint(4, None) 18 | int8 = ScalarType.int_(8, None) 19 | uint8 = ScalarType.uint(8, None) 20 | float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN.value) 21 | float8_e5m2 = ScalarType.float_IEEE754(5, 2) 22 | float16_e8m7 = ScalarType.float_IEEE754(8, 7) 23 | float16_e5m10 = ScalarType.float_IEEE754(5, 10) 24 | 25 | # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main 26 | float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE.value) 27 | 28 | # "gptq" types 29 | uint4b8 = ScalarType.uint(4, 8) 30 | uint8b128 = ScalarType.uint(8, 128) 31 | 32 | # colloquial names 33 | bfloat16 = float16_e8m7 34 | float16 = float16_e5m10 35 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/tokenformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/tokenformer/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.envs import VLLM_USE_MODELSCOPE 2 | 3 | if VLLM_USE_MODELSCOPE: 4 | # Patch here, before each import happens 5 | import modelscope 6 | from packaging import version 7 | 8 | # patch_hub begins from modelscope>=1.18.1 9 | if version.parse(modelscope.__version__) <= version.parse("1.18.0"): 10 | raise ImportError( 11 | "Using vLLM with ModelScope needs modelscope>=1.18.1, please " 12 | "install by `pip install modelscope>=1.18.1`" 13 | ) 14 | 15 | from modelscope.utils.hf_util import patch_hub 16 | 17 | # Patch hub to download models from modelscope to speed up. 
18 | patch_hub() 19 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/transformers_utils/configs/mllama.py: -------------------------------------------------------------------------------- 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config 2 | 3 | 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): 5 | """ 6 | Use this class to override is_encoder_decoder: 7 | - transformers regards mllama as is_encoder_decoder=False 8 | - vllm needs is_encoder_decoder=True to enable cross-attention 9 | """ 10 | 11 | def __init__( 12 | self, 13 | **kwargs, 14 | ): 15 | super().__init__(**kwargs) 16 | self.is_encoder_decoder = True 17 | 18 | 19 | class MllamaConfig(mllama_hf_config.MllamaConfig): 20 | 21 | def __init__( 22 | self, 23 | text_config=None, 24 | **kwargs, 25 | ): 26 | if isinstance(text_config, dict): 27 | text_config = MllamaTextConfig(**text_config) 28 | super().__init__(text_config=text_config, **kwargs) 29 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 3 | # -------------------------------------------------------- 4 | # NVLM-D 5 | # Copyright (c) 2024 NVIDIA 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | from .internvl import InternVLChatConfig 9 | 10 | 11 | class NVLM_D_Config(InternVLChatConfig): 12 | model_type = "NVLM_D" 13 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .mistral import MistralTokenizer 2 | 3 | __all__ = ["MistralTokenizer"] 4 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/transformers_utils/utils.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool: 7 | """Check if the file is a GGUF model.""" 8 | model = Path(model) 9 | if not model.is_file(): 10 | return False 11 | elif model.suffix == ".gguf": 12 | return True 13 | 14 | with open(model, "rb") as f: 15 | header = f.read(4) 16 | return header == b"GGUF" 17 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.triton_utils.importing import HAS_TRITON 2 | 3 | __all__ = ["HAS_TRITON"] 4 | 5 | if HAS_TRITON: 6 | 7 | from vllm.triton_utils.custom_cache_manager import maybe_set_triton_cache_manager 8 | from vllm.triton_utils.libentry import libentry 9 | 10 | __all__ += ["maybe_set_triton_cache_manager", "libentry"] 11 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/triton_utils/importing.py: -------------------------------------------------------------------------------- 1 | from importlib.util import find_spec 2 | 3 | from vllm.logger import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | HAS_TRITON = 
find_spec("triton") is not None 8 | 9 | if not HAS_TRITON: 10 | logger.info( 11 | "Triton not installed; certain GPU-related functions" " will not be available." 12 | ) 13 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/usage/__init__.py -------------------------------------------------------------------------------- /infra/cray_infra/vllm/version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from ._version import __version__, __version_tuple__ 3 | except Exception as e: 4 | import warnings 5 | 6 | warnings.warn(f"Failed to read commit hash:\n{e}", RuntimeWarning, stacklevel=2) 7 | 8 | __version__ = "dev" 9 | __version_tuple__ = (0, 0, __version__) 10 | -------------------------------------------------------------------------------- /infra/cray_infra/vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /infra/cray_infra/vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/worker/__init__.py -------------------------------------------------------------------------------- /infra/csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /infra/csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /infra/csrc/cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | void swap_blocks(torch::Tensor& src, torch::Tensor& dst, 9 | const torch::Tensor& block_mapping); 10 | 11 | // Note: the key_caches and value_caches vectors are constant but 12 | // not the Tensors they contain. 
The vectors need to be const refs 13 | // in order to satisfy pytorch's C++ operator registration code. 14 | void copy_blocks(std::vector const& key_caches, 15 | std::vector const& value_caches, 16 | const torch::Tensor& block_mapping); 17 | 18 | void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, 19 | torch::Tensor& key_cache, torch::Tensor& value_cache, 20 | torch::Tensor& slot_mapping, 21 | const std::string& kv_cache_dtype, const double k_scale, 22 | const double v_scale); 23 | 24 | void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, 25 | torch::Tensor& key_cache, 26 | torch::Tensor& value_cache, 27 | torch::Tensor& slot_mapping, 28 | const std::string& kv_cache_dtype, 29 | const double k_scale, const double v_scale); 30 | 31 | // Just for unittest 32 | void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, 33 | const double scale, const std::string& kv_cache_dtype); 34 | -------------------------------------------------------------------------------- /infra/csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /infra/csrc/core/registration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define _CONCAT(A, B) A##B 6 | #define CONCAT(A, B) _CONCAT(A, B) 7 | 8 | #define _STRINGIFY(A) #A 9 | #define STRINGIFY(A) _STRINGIFY(A) 10 | 11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME 12 | // could be a macro instead of a literal token. 13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) 14 | 15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME 16 | // could be a macro instead of a literal token. 17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ 18 | TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) 19 | 20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized 21 | // via python's import statement. 22 | #define REGISTER_EXTENSION(NAME) \ 23 | PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ 24 | static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ 25 | STRINGIFY(NAME), nullptr, 0, nullptr}; \ 26 | return PyModule_Create(&module); \ 27 | } 28 | -------------------------------------------------------------------------------- /infra/csrc/core/torch_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "scalar_type.hpp" 4 | #include "registration.h" 5 | 6 | // Note the CORE exstension will be built for (almost) all hardware targets so 7 | // new additions must account for this. (currently not built for TPU and Neuron) 8 | 9 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) { 10 | // ScalarType, a custom class for representing data types that supports 11 | // quantized types, declared here so it can be used when creating interfaces 12 | // for custom ops. 
13 | vllm::ScalarTypeTorch::bind_class(lib); 14 | } 15 | 16 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME) 17 | -------------------------------------------------------------------------------- /infra/csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | //x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | //ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__aarch64__) 11 | //arm implementation 12 | #include "cpu_types_arm.hpp" 13 | #else 14 | #warning "unsupported vLLM cpu implementation" 15 | #endif 16 | 17 | #endif -------------------------------------------------------------------------------- /infra/csrc/cuda_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_ROCM 4 | #include 5 | #endif 6 | 7 | #ifndef USE_ROCM 8 | #define WARP_SIZE 32 9 | #else 10 | #define WARP_SIZE warpSize 11 | #endif 12 | 13 | #ifndef USE_ROCM 14 | #define VLLM_LDG(arg) __ldg(arg) 15 | #else 16 | #define VLLM_LDG(arg) *(arg) 17 | #endif 18 | 19 | #ifndef USE_ROCM 20 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ 21 | __shfl_xor_sync(uint32_t(-1), var, lane_mask) 22 | #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ 23 | __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) 24 | #else 25 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 26 | #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ 27 | __shfl_xor(var, lane_mask, width) 28 | #endif 29 | 30 | #ifndef USE_ROCM 31 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 32 | #else 33 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 34 | #endif 35 | 36 | #ifndef USE_ROCM 37 | #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ 38 | __shfl_down_sync(uint32_t(-1), var, lane_delta) 39 | #else 40 | #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) 41 | #endif 42 | 43 | #ifndef USE_ROCM 44 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 45 | cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) 46 | #else 47 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 48 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 49 | #endif 50 | -------------------------------------------------------------------------------- /infra/csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 4 | #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 5 | #define DEVICE_INLINE __forceinline__ __device__ 6 | #define HOST_INLINE __forceinline__ __host__ 7 | #else 8 | #define HOST_DEVICE_INLINE inline 9 | #define DEVICE_INLINE inline 10 | #define HOST_INLINE inline 11 | #endif 12 | 13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id); 14 | 15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); 16 | -------------------------------------------------------------------------------- /infra/csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) { 6 | int device, value; 7 | if (device_id < 
0) { 8 | cudaGetDevice(&device); 9 | } else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), 13 | device); 14 | return value; 15 | } 16 | 17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { 18 | int64_t attribute; 19 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 20 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 21 | 22 | #ifdef USE_ROCM 23 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 24 | #else 25 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 26 | #endif 27 | 28 | return get_device_attribute(attribute, device_id); 29 | } 30 | -------------------------------------------------------------------------------- /infra/csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #pragma once 6 | 7 | #include 8 | 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 10 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 13 | 14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 15 | AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 16 | 17 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ 18 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 19 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 20 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 21 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) 22 | 23 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ 24 | AT_DISPATCH_SWITCH(TYPE, NAME, \ 25 | VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) 26 | 27 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 28 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 29 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 30 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 31 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 32 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 33 | 34 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ 35 | AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 36 | -------------------------------------------------------------------------------- /infra/csrc/mamba/causal_conv1d/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 4 | // clang-format off 5 | // adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/static_switch.h 6 | 7 | #pragma once 8 | 9 | /// @param COND - a boolean expression to switch by 10 | /// @param CONST_NAME - a name given for the constexpr bool variable. 11 | /// @param ... - code to execute for true and false 12 | /// 13 | /// Usage: 14 | /// ``` 15 | /// BOOL_SWITCH(flag, BoolConst, [&] { 16 | /// some_function(...); 17 | /// }); 18 | /// ``` 19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ 20 | [&] { \ 21 | if (COND) { \ 22 | static constexpr bool CONST_NAME = true; \ 23 | return __VA_ARGS__(); \ 24 | } else { \ 25 | static constexpr bool CONST_NAME = false; \ 26 | return __VA_ARGS__(); \ 27 | } \ 28 | }() 29 | -------------------------------------------------------------------------------- /infra/csrc/mamba/mamba_ssm/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 4 | 5 | // clang-format off 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h 7 | #pragma once 8 | 9 | /// @param COND - a boolean expression to switch by 10 | /// @param CONST_NAME - a name given for the constexpr bool variable. 11 | /// @param ... - code to execute for true and false 12 | /// 13 | /// Usage: 14 | /// ``` 15 | /// BOOL_SWITCH(flag, BoolConst, [&] { 16 | /// some_function(...); 17 | /// }); 18 | /// ``` 19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \ 20 | [&] { \ 21 | if (COND) { \ 22 | constexpr bool CONST_NAME = true; \ 23 | return __VA_ARGS__(); \ 24 | } else { \ 25 | constexpr bool CONST_NAME = false; \ 26 | return __VA_ARGS__(); \ 27 | } \ 28 | }() 29 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = true; 18 | 19 | if (false) { 20 | } 21 | AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) 22 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) 23 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) 24 | AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4b8.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4b8( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4b8( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku8b128.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /infra/csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, 6 | torch::Tensor& token_expert_indices, 7 | torch::Tensor& gating_output); 8 | 
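The topk_softmax declaration above is registered as a PyTorch custom op in the moe/torch_bindings.cpp file that follows. As a rough, non-authoritative illustration of how such a registered op is reached from Python: the namespace it lands in is set by the TORCH_EXTENSION_NAME macro at build time, so the "_moe_C" module name below is an assumption, not something defined anywhere in this listing, and the tensor dtypes are likewise illustrative.

import torch

# Hypothetical call sketch for the registered op. Shapes follow the schema in
# torch_bindings.cpp: the first three tensors are outputs ("Tensor!") that the
# kernel fills in place; gating_output is the input of shape
# [num_tokens, num_experts].
num_tokens, num_experts, topk = 4, 8, 2
gating_output = torch.randn(num_tokens, num_experts, device="cuda")
topk_weights = torch.empty(num_tokens, topk, dtype=torch.float32, device="cuda")
topk_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")
token_expert_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")

# "_moe_C" is assumed here; the real extension name comes from the build.
torch.ops._moe_C.topk_softmax(topk_weights, topk_indices,
                              token_expert_indices, gating_output)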
-------------------------------------------------------------------------------- /infra/csrc/moe/torch_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "core/registration.h" 2 | #include "moe_ops.h" 3 | 4 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { 5 | // Apply topk softmax to the gating outputs. 6 | m.def( 7 | "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " 8 | "token_expert_indices, Tensor gating_output) -> ()"); 9 | m.impl("topk_softmax", torch::kCUDA, &topk_softmax); 10 | 11 | #ifndef USE_ROCM 12 | m.def( 13 | "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " 14 | "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " 15 | "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " 16 | "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, " 17 | "int size_n, int size_k, bool is_k_full, int num_experts, int topk, " 18 | "int moe_block_size, bool replicate_input, bool apply_weights)" 19 | " -> Tensor"); 20 | // conditionally compiled so impl registration is in source file 21 | #endif 22 | } 23 | 24 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME) 25 | -------------------------------------------------------------------------------- /infra/csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /infra/csrc/quantization/cutlass_w8a8/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | #include 5 | 6 | /** 7 | * Helper function for checking CUTLASS errors 8 | */ 9 | #define CUTLASS_CHECK(status) \ 10 | { \ 11 | TORCH_CHECK(status == cutlass::Status::kSuccess, \ 12 | cutlassGetStatusString(status)) \ 13 | } 14 | 15 | inline uint32_t next_pow_2(uint32_t const num) { 16 | if (num <= 1) return num; 17 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 18 | } 19 | 20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { 21 | int max_shared_mem_per_block_opt_in = 0; 22 | cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, 23 | cudaDevAttrMaxSharedMemoryPerBlockOptin, 24 | device); 25 | return max_shared_mem_per_block_opt_in; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /infra/csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, 
i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /infra/csrc/quantization/machete/machete_collective_builder.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass_extensions/vllm_collective_builder.cuh" 4 | #include "machete_mainloop.cuh" 5 | 6 | namespace cutlass::gemm::collective { 7 | using namespace cute; 8 | 9 | struct MacheteKernelTag {}; 10 | 11 | template 15 | struct VLLMCollectiveBuilder< 16 | MacheteKernelTag, arch::Sm90, arch::OpClassTensorOp, ElementPairA_, 17 | GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, AlignmentB, 18 | ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, 19 | KernelScheduleType, 20 | cute::enable_if_t<( 21 | cute::is_same_v || 23 | cute::is_same_v || 25 | cute::is_same_v)>> { 27 | using CollectiveOp = machete::MacheteCollectiveMma< 28 | ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, 29 | AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, 30 | StageCountType, KernelScheduleType>; 31 | }; 32 | 33 | }; // namespace cutlass::gemm::collective -------------------------------------------------------------------------------- /infra/csrc/quantization/machete/machete_interleaving_utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | #include "cute/layout.hpp" 5 | 6 | namespace machete { 7 | 8 | using namespace cute; 9 | 10 | // get an interleaved block layout where each element consecutive element has a 11 | // stride of bit_stride and the block width is blk_bit_width, 12 | // examples: 13 | // size_bits = 8, bit_stride = 8, blk_bit_width = 32 -> 4:1 14 | // size_bits = 8, bit_stride = 16, blk_bit_width = 32 -> (2, 2):(2, 1) 15 | // size_bits = 4, bit_stride = 8, blk_bit_width = 32 -> (4, 2):(2, 1) 16 | // size_bits = 4, bit_stride = 16, blk_bit_width = 32 -> (2, 4):(4, 1) 17 | template 18 | CUTE_HOST_DEVICE static constexpr auto get_interleaved_blk_layout() { 19 | static_assert(blk_bit_width % bit_stride == 0); 20 | static_assert(bit_stride % cute::sizeof_bits_v == 0); 21 | 22 | constexpr auto elems_per_blk = blk_bit_width / cute::sizeof_bits_v; 23 | 24 | if constexpr (cute::sizeof_bits_v == bit_stride) { 25 | // identity layout 26 | return Layout>>{}; 27 | } else { 28 | constexpr auto elems_per_stride = bit_stride / cute::sizeof_bits_v; 29 | constexpr auto num_strides = elems_per_blk / elems_per_stride; 30 | return Layout, Int>, 31 | Stride, Int<1>>>{}; 32 | } 33 | } 34 | 35 | }; // namespace machete 36 | -------------------------------------------------------------------------------- /infra/csrc/quantization/marlin/dense/common/base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Modified by HandH1998 3 | * Modified by Neural Magic 4 | * Copyright (C) Marlin.2024 Elias Frantar 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | #pragma once 20 | 21 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } 22 | 23 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed 24 | // for instance as inputs to tensor core operations. Consequently, all 25 | // corresponding index accesses must be compile-time constants, which is why we 26 | // extensively use `#pragma unroll` throughout the kernel code to guarantee 27 | // this. 28 | template 29 | struct Vec { 30 | T elems[n]; 31 | __device__ T& operator[](int i) { return elems[i]; } 32 | }; 33 | -------------------------------------------------------------------------------- /infra/csrc/rocm/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, 6 | torch::Tensor& max_logits, torch::Tensor& tmp_out, 7 | torch::Tensor& query, torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, int64_t num_kv_heads, 9 | double scale, torch::Tensor& block_tables, 10 | torch::Tensor& context_lens, int64_t block_size, 11 | int64_t max_context_len, 12 | const c10::optional& alibi_slopes, 13 | const std::string& kv_cache_dtype, double k_scale, 14 | double v_scale); 15 | -------------------------------------------------------------------------------- /infra/csrc/rocm/torch_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "core/registration.h" 2 | #include "rocm/ops.h" 3 | 4 | // Note on op signatures: 5 | // The X_meta signatures are for the meta functions corresponding to op X. 6 | // They must be kept in sync with the signature for X. Generally, only 7 | // functions that return Tensors require a meta function. 8 | // 9 | // See the following links for detailed docs on op registration and function 10 | // schemas. 11 | // https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9 12 | // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations 13 | 14 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { 15 | // vLLM custom ops for rocm 16 | 17 | // Custom attention op 18 | // Compute the attention between an input query and the cached 19 | // keys/values using PagedAttention. 20 | rocm_ops.def( 21 | "paged_attention(Tensor! out, Tensor exp_sums," 22 | " Tensor max_logits, Tensor tmp_out," 23 | " Tensor query, Tensor key_cache," 24 | " Tensor value_cache, int num_kv_heads," 25 | " float scale, Tensor block_tables," 26 | " Tensor context_lens, int block_size," 27 | " int max_context_len," 28 | " Tensor? 
alibi_slopes," 29 | " str kv_cache_dtype," 30 | " float k_scale, float v_scale) -> ()"); 31 | rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention); 32 | } 33 | 34 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME) 35 | -------------------------------------------------------------------------------- /infra/requirements-vllm-build.txt: -------------------------------------------------------------------------------- 1 | setuptools-scm>=8 2 | wheel 3 | cmake>=3.26 4 | ninja 5 | -------------------------------------------------------------------------------- /infra/requirements-vllm.txt: -------------------------------------------------------------------------------- 1 | transformers == 4.48.0 # Required for Llama 3.2. 2 | peft 3 | psutil 4 | typing_extensions >= 4.10 5 | msgspec 6 | pydantic >= 2.9 # Required for fastapi >= 0.113.0 7 | gguf == 0.10.0 8 | sentencepiece # Required for LLaMA tokenizer. 9 | mistral_common[opencv] >= 1.4.4 10 | py-cpuinfo 11 | aiohttp 12 | openai >= 1.40.0 # Ensure modern openai package (ensure types module present) 13 | uvicorn[standard] 14 | fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' 15 | fastapi-utils 16 | typing-inspect 17 | pyzmq 18 | cloudpickle 19 | partial-json-parser # used for parsing partial JSON outputs 20 | prometheus_client >= 0.18.0 21 | prometheus-fastapi-instrumentator >= 7.0.0 22 | outlines >= 0.0.43, < 0.1 23 | einops # Required for Qwen2-VL. 24 | protobuf 25 | nvidia-ml-py # for pynvml package 26 | persist-queue 27 | -------------------------------------------------------------------------------- /infra/slurm_configs/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupPlugin=cgroup/docker 2 | CgroupMountpoint=/tmp/cgroup 3 | ConstrainCores=yes 4 | ConstrainDevices=yes 5 | ConstrainRAMSpace=yes 6 | -------------------------------------------------------------------------------- /infra/slurm_configs/gres.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/gres.conf -------------------------------------------------------------------------------- /infra/slurm_configs/munge.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/munge.key -------------------------------------------------------------------------------- /infra/slurm_configs/slurm.conf: -------------------------------------------------------------------------------- 1 | SlurmctldHost=1d4b95352faa 2 | SlurmctldPort=6817 3 | SlurmdPort=6818 4 | SrunPortRange=7000-7200 5 | JobRequeue=0 6 | MpiDefault=none 7 | ProctrackType=proctrack/linuxproc 8 | ReturnToService=1 9 | SlurmctldPidFile=/var/run/slurmctld.pid 10 | SlurmdPidFile=/var/run/slurmd.pid 11 | SlurmdSpoolDir=/var/spool/slurmd 12 | SlurmUser=root 13 | SlurmdUser=root 14 | StateSaveLocation=/var/spool 15 | SwitchType=switch/none 16 | TaskPlugin=task/none 17 | InactiveLimit=0 18 | KillWait=30 19 | MinJobAge=300 20 | SlurmctldTimeout=120 21 | SlurmdTimeout=300 22 | Waittime=0 23 | SchedulerType=sched/backfill 24 | SelectType=select/cons_tres 25 | SelectTypeParameters=CR_Core 26 | AccountingStorageType=accounting_storage/none 27 | AccountingStoreFlags=job_comment 28 | ClusterName=supermas 29 | JobCompType=jobcomp/none 30 | JobAcctGatherFrequency=30 31 | 
JobAcctGatherType=jobacct_gather/none 32 | SlurmctldDebug=debug 33 | AuthType=auth/none 34 | CredType=cred/none 35 | SlurmctldLogFile=/var/log/slurm/slurmctld.log 36 | SlurmdDebug=debug 37 | SlurmdLogFile=/var/log/slurm/slurmd.log 38 | NodeName=1d4b95352faa CPUs=8 State=UNKNOWN 39 | PartitionName=short Nodes=1d4b95352faa Default=YES MaxTime=20 State=UP 40 | -------------------------------------------------------------------------------- /infra/slurm_configs/slurm.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/slurm.key -------------------------------------------------------------------------------- /infra/slurm_src/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | # Get the directory of this script 13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 14 | 15 | SOURCE_FILE=$LOCAL_DIRECTORY/cgroup_docker.c 16 | INCLUDE_PATH=/usr/include 17 | 18 | # Compile the cgroup_docker.c file into a shared object file 19 | gcc -I$INCLUDE_PATH -Wall -fPIC -shared -o $LOCAL_DIRECTORY/cgroup_docker.so $SOURCE_FILE 20 | 21 | # Determine if the target is an x86_64 or aarch64 machine 22 | if [ "$(uname -m)" == "x86_64" ]; then 23 | TARGET="x86_64-linux-gnu" 24 | elif [ "$(uname -m)" == "aarch64" ]; then 25 | TARGET="aarch64-linux-gnu" 26 | else 27 | echo "Unsupported architecture" 28 | exit 1 29 | fi 30 | 31 | # Copy the shared object file to the /usr/lib directory 32 | cp /app/cray/infra/slurm_src/cgroup_docker.so /usr/lib/$TARGET/slurm-wlm/cgroup_docker.so 33 | 34 | # Disable the plugin on the AMD target 35 | if [ $BASE_NAME == "amd" ]; then 36 | sed -i -e 's/CgroupPlugin=cgroup\/docker/CgroupPlugin=cgroup\/v1/g' /app/cray/infra/slurm_configs/cgroup.conf 37 | fi 38 | 39 | 40 | -------------------------------------------------------------------------------- /ml/cray_megatron/collectives/data_parallelism.py: -------------------------------------------------------------------------------- 1 | from gpu_aware_mpi import get_rank, get_size 2 | 3 | def get_data_parallel_rank(): 4 | return get_rank() 5 | 6 | 7 | def get_data_parallel_world_size(): 8 | return get_size() 9 | -------------------------------------------------------------------------------- /ml/cray_megatron/collectives/main_rank_only.py: -------------------------------------------------------------------------------- 1 | 2 | from gpu_aware_mpi import get_rank, barrier 3 | 4 | 5 | def is_main_rank(): 6 | return get_rank() == 0 7 | 8 | def main_rank_only(func): 9 | def wrap_function(*args, **kwargs): 10 | result = None 11 | barrier() 12 | if is_main_rank(): 13 | result = func(*args, **kwargs) 14 | barrier() 15 | return result 16 | 17 | return wrap_function 18 | -------------------------------------------------------------------------------- /ml/cray_megatron/huggingface/download_model.py: -------------------------------------------------------------------------------- 1 | from cray_megatron.collectives.main_rank_only import main_rank_only 2 | 3 
| from huggingface_hub import snapshot_download 4 | 5 | 6 | @main_rank_only 7 | def download_model(model_name): 8 | snapshot_download(repo_id=model_name) 9 | -------------------------------------------------------------------------------- /ml/cray_megatron/megatron/dataset/data_loader.py: -------------------------------------------------------------------------------- 1 | from cray_megatron.megatron.dataset.load_dataset import load_dataset 2 | 3 | from cray_infra.util.get_job_config import get_job_config 4 | 5 | import torch 6 | 7 | 8 | class DataLoader: 9 | def __init__(self, model, tokenizer): 10 | 11 | self.model = model 12 | self.tokenizer = tokenizer 13 | self.batch_size = get_batch_size() 14 | self.epoch = 0 15 | 16 | self.dataset = load_dataset( 17 | model=self.model, 18 | tokenizer=self.tokenizer, 19 | epoch=self.epoch, 20 | ) 21 | 22 | self.loader = torch.utils.data.DataLoader( 23 | self.dataset, batch_size=self.batch_size 24 | ) 25 | 26 | def __iter__(self): 27 | self.iterator = iter(self.loader) 28 | return self 29 | 30 | def __next__(self): 31 | try: 32 | return next(self.iterator) 33 | except StopIteration: 34 | self.epoch += 1 35 | self.dataset = load_dataset( 36 | model=self.model, 37 | tokenizer=self.tokenizer, 38 | epoch=self.epoch, 39 | ) 40 | self.loader = torch.utils.data.DataLoader( 41 | self.dataset, batch_size=self.batch_size 42 | ) 43 | self.iterator = iter(self.loader) 44 | 45 | return next(self.iterator) 46 | 47 | 48 | def get_batch_size(): 49 | job_config = get_job_config() 50 | return job_config["batch_size"] 51 | -------------------------------------------------------------------------------- /ml/cray_megatron/megatron/distribution/apply_distribution_strategy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from cray_infra.training.distribution_strategy.fsdp.fsdp import SimpleFSDP 3 | 4 | from gpu_aware_mpi import get_size, get_rank 5 | 6 | def load_distribution_strategy(): 7 | device = get_device() 8 | 9 | strategy = { 10 | "device": device, 11 | } 12 | 13 | if get_size() > 1: 14 | strategy["strategy"] = SimpleFSDP 15 | 16 | return strategy 17 | 18 | 19 | def get_device(): 20 | if torch.cuda.is_available(): 21 | rank = get_rank() 22 | 23 | gpu_count = torch.cuda.device_count() 24 | 25 | selected_gpu = rank % gpu_count 26 | 27 | if gpu_count > 1: 28 | return torch.device(f"cuda:{selected_gpu}") 29 | 30 | return torch.cuda.current_device() 31 | else: 32 | return torch.device("cpu") 33 | 34 | 35 | def apply_distribution_strategy(model_info): 36 | distribution_strategy = load_distribution_strategy() 37 | model_info["distribution_strategy"] = distribution_strategy 38 | return model_info 39 | -------------------------------------------------------------------------------- /ml/cray_megatron/megatron/megatron_trainer.py: -------------------------------------------------------------------------------- 1 | from cray_infra.training.training_harness import TrainingHarness 2 | from cray_infra.training.training_job_status import TrainingJobStatus 3 | from cray_infra.training.print_logo import print_logo 4 | 5 | from cray_megatron.megatron.training_loop import TrainingLoop, get_max_steps 6 | 7 | import sys 8 | 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class MegatronTrainer: 15 | def __init__(self, training_harness: TrainingHarness): 16 | self.training_harness = training_harness 17 | 18 | def train(self): 19 | self.train_loop() 20 | 21 | def train_loop(self): 22 | 
self.training_harness.update_status( 23 | status=TrainingJobStatus.TRAINING, metadata={"max_steps": get_max_steps()} 24 | ) 25 | 26 | print_logo() 27 | 28 | TrainingLoop(self.training_harness).train() 29 | 30 | self.training_harness.update_status(status=TrainingJobStatus.COMPLETED) 31 | -------------------------------------------------------------------------------- /ml/cray_megatron/models/does_any_checkpoint_exist.py: -------------------------------------------------------------------------------- 1 | from cray_megatron.models.get_latest_checkpoint_path import ( 2 | get_latest_checkpoint_path, 3 | ) 4 | 5 | 6 | def does_any_checkpoint_exist(): 7 | return get_latest_checkpoint_path() is not None 8 | -------------------------------------------------------------------------------- /ml/cray_megatron/models/get_model_manager.py: -------------------------------------------------------------------------------- 1 | 2 | from cray_infra.util.get_config import get_config 3 | 4 | from cray_megatron.models.tokenformer.tokenformer_model_manager import TokenformerModelManager 5 | 6 | def get_model_manager(): 7 | config = get_config() 8 | 9 | return TokenformerModelManager() 10 | 11 | 12 | -------------------------------------------------------------------------------- /ml/cray_megatron/models/model_manager_base.py: -------------------------------------------------------------------------------- 1 | 2 | from cray_megatron.models.get_latest_checkpoint_path import get_latest_checkpoint_path 3 | from cray_megatron.models.does_any_checkpoint_exist import does_any_checkpoint_exist 4 | 5 | from abc import ABC, abstractmethod 6 | 7 | class ModelManagerBase(ABC): 8 | @abstractmethod 9 | def load_model(self): 10 | pass 11 | 12 | def does_any_checkpoint_exist(self): 13 | return does_any_checkpoint_exist() 14 | 15 | def get_latest_checkpoint_path(self): 16 | return get_latest_checkpoint_path() 17 | 18 | -------------------------------------------------------------------------------- /ml/cray_megatron/models/tokenformer/tokenformer_model_manager.py: -------------------------------------------------------------------------------- 1 | 2 | from cray_megatron.models.model_manager_base import ModelManagerBase 3 | 4 | from cray_megatron.models.tokenformer.load_tokenformer_model import load_tokenformer_model 5 | 6 | class TokenformerModelManager(ModelManagerBase): 7 | def load_model(self): 8 | return load_tokenformer_model() 9 | 10 | 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<2.0.0 2 | jsonlines 3 | aiofiles 4 | persist-queue 5 | matplotlib 6 | streaming-form-data==1.15.0 7 | mpi4py==4.0.3 8 | openmpi==0.0.0 9 | humanize 10 | -------------------------------------------------------------------------------- /scalarlm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | # Get the directory of this script 13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 14 | 15 | # Refresh the bashly command 16 | $LOCAL_DIRECTORY/cmd/bashly.sh 
generate 17 | 18 | # Call the generated CLI script 19 | $LOCAL_DIRECTORY/scripts/scalarlm "$@" 20 | -------------------------------------------------------------------------------- /scripts/start_one_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | # Get the directory of this script 13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 14 | 15 | $LOCAL_DIRECTORY/start_slurm.sh 16 | 17 | python -m cray_infra.one_server.main 18 | 19 | -------------------------------------------------------------------------------- /scripts/start_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | # Get the directory of this script 13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 14 | 15 | # Run the slurm discovery service 16 | python $LOCAL_DIRECTORY/../infra/cray_infra/slurm/discovery/discover_clusters.py 17 | 18 | slurmctld 19 | slurmd 20 | 21 | -------------------------------------------------------------------------------- /scripts/train_job_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Safely execute this bash script 4 | # e exit on first failure 5 | # x all executed commands are printed to the terminal 6 | # u unset variables are errors 7 | # a export all variables to the environment 8 | # E any trap on ERR is inherited by shell functions 9 | # -o pipefail | produces a failure code if any stage fails 10 | set -Eeuoxa pipefail 11 | 12 | export CRAY_TRAINING_JOB_CONFIG_PATH=REPLACE_CONFIG_PATH 13 | 14 | # Get the directory of this script 15 | LOCAL_DIRECTORY="$( cd "$( dirname "${CRAY_TRAINING_JOB_CONFIG_PATH}" )" >/dev/null 2>&1 && pwd )" 16 | 17 | # Put the current ml directory in the python path so that the modules can be imported 18 | export PYTHONPATH=$LOCAL_DIRECTORY/ml:$PYTHONPATH 19 | 20 | mpirun --allow-run-as-root --oversubscribe python $LOCAL_DIRECTORY/ml/cray_megatron/main.py $* 21 | -------------------------------------------------------------------------------- /sdk/masint/__init__.py: -------------------------------------------------------------------------------- 1 | from masint.api.async_supermassive_intelligence import AsyncSupermassiveIntelligence 2 | from masint.api.supermassive_intelligence import SupermassiveIntelligence 3 | -------------------------------------------------------------------------------- /sdk/masint/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/api/__init__.py 
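The sdk/masint package above is the public client: masint/__init__.py re-exports SupermassiveIntelligence and AsyncSupermassiveIntelligence, and the scalarlm package later in this listing simply re-exports masint. A minimal sketch of pointing the client at a server, based on get_api_base.py and the embed.py test that appear further down; the URL is just the default local endpoint, not a required value.

import scalarlm

# The API base is resolved in this order (see sdk/masint/util/get_api_base.py):
# the scalarlm.api_url / masint.api_url module attributes, then the
# SCALARLM_API_URL / MASINT_API_URL environment variables, then the default
# http://localhost:8000.
scalarlm.api_url = "http://localhost:8000"

llm = scalarlm.SupermassiveIntelligence()
responses = llm.embed(prompts=["What is 1 + 1?"])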
-------------------------------------------------------------------------------- /sdk/masint/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/cli/__init__.py -------------------------------------------------------------------------------- /sdk/masint/cli/squeue.py: -------------------------------------------------------------------------------- 1 | from masint.util.make_api_url import make_api_url 2 | 3 | import aiohttp 4 | import asyncio 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def squeue(): 12 | logger.info(f"Getting squeue") 13 | 14 | try: 15 | asyncio.run(squeue_async()) 16 | except Exception as e: 17 | logger.error(f"Failed to get squeue output") 18 | logger.error(e) 19 | 20 | 21 | async def squeue_async(): 22 | async with aiohttp.ClientSession() as session: 23 | async with session.get(make_api_url(f"v1/megatron/squeue")) as resp: 24 | data = await resp.json() 25 | 26 | logger.info(f"Got response for squeue") 27 | logger.info(data) 28 | 29 | if resp.status != 200: 30 | logger.error(f"Failed to get squeue") 31 | logger.error(data) 32 | raise Exception("Failed to get squeue") 33 | 34 | print(data["squeue_output"]) 35 | 36 | -------------------------------------------------------------------------------- /sdk/masint/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/engines/__init__.py -------------------------------------------------------------------------------- /sdk/masint/engines/cray/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/engines/cray/__init__.py -------------------------------------------------------------------------------- /sdk/masint/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/util/__init__.py -------------------------------------------------------------------------------- /sdk/masint/util/get_api_base.py: -------------------------------------------------------------------------------- 1 | import masint 2 | import scalarlm 3 | 4 | import os 5 | 6 | def get_api_base(): 7 | if hasattr(scalarlm, "api_url") and scalarlm.api_url is not None: 8 | return scalarlm.api_url 9 | 10 | if hasattr(masint, "api_url") and masint.api_url is not None: 11 | return masint.api_url 12 | 13 | if "SCALARLM_API_URL" in os.environ: 14 | return os.environ["SCALARLM_API_URL"] 15 | 16 | if "MASINT_API_URL" in os.environ: 17 | return os.environ["MASINT_API_URL"] 18 | 19 | return "http://localhost:8000" 20 | 21 | -------------------------------------------------------------------------------- /sdk/masint/util/make_api_url.py: -------------------------------------------------------------------------------- 1 | from masint.util.get_api_base import get_api_base 2 | 3 | 4 | def make_api_url(endpoint, api_url=None): 5 | if api_url is not None: 6 | api_base = api_url 7 | else: 8 | api_base = get_api_base() 9 | return f"{api_base}/{endpoint}" 10 | -------------------------------------------------------------------------------- 
/sdk/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=65.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scalarlm" 7 | version = "0.81" 8 | authors = [ 9 | { name="Greg Diamos", email="gregory.diamos@gmail.com" }, 10 | ] 11 | description = "ScalarLM is a unified LLM inference and training platform" 12 | readme = "README.md" 13 | requires-python = ">=3.7" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "aiohttp", 21 | "aiofiles", 22 | "jsonlines", 23 | "matplotlib", 24 | "humanize", 25 | ] 26 | 27 | [tool.setuptools] 28 | packages = [ 29 | "scalarlm", 30 | "masint", 31 | "masint.api", 32 | "masint.cli", 33 | "masint.util", 34 | "masint.engines", 35 | "masint.engines.cray", 36 | ] 37 | 38 | [project.scripts] 39 | scalarlm = "masint.cli.main:main" 40 | 41 | [tool.autopep8] 42 | max_line_length = 120 43 | in-place = true 44 | recursive = true 45 | aggressive = 2 46 | 47 | [project.urls] 48 | Homepage = "https://github.com/tensorwavecloud/scalarlm" 49 | 50 | -------------------------------------------------------------------------------- /sdk/scalarlm/__init__.py: -------------------------------------------------------------------------------- 1 | from masint import * 2 | -------------------------------------------------------------------------------- /test/benchmark/main.py: -------------------------------------------------------------------------------- 1 | from benchmark.pytorch.memcpy import benchmark_memcpy 2 | from benchmark.pytorch.memcpy_peer import benchmark_memcpy_peer 3 | from benchmark.pytorch.gemm import benchmark_gemm 4 | from benchmark.pytorch.forward import benchmark_forward 5 | from benchmark.pytorch.backward import benchmark_backward 6 | 7 | from benchmark.roofline.plot_roofline import plot_roofline 8 | from benchmark.roofline.plot_bandwidth_sweep import plot_bandwidth_sweep 9 | 10 | import os 11 | 12 | import logging 13 | 14 | def main(): 15 | setup_logging() 16 | 17 | os.environ["HUGGING_FACE_HUB_TOKEN"] = "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" 18 | 19 | benchmark_memcpy() 20 | benchmark_memcpy_peer() 21 | benchmark_gemm() 22 | benchmark_forward() 23 | benchmark_backward() 24 | 25 | plot_roofline() 26 | plot_bandwidth_sweep() 27 | 28 | 29 | def setup_logging(): 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /test/deployment/embed.py: -------------------------------------------------------------------------------- 1 | import scalarlm 2 | 3 | 4 | scalarlm.api_url = "http://localhost:8000" 5 | #scalarlm.api_url = "https://meta-llama--llama-3-2-3b-instruct.cray-lm.com" 6 | #scalarlm.api_url = "https://greg1232--cray-cpu-llama-3-2-1b-instruct-fastapi-app.modal.run" 7 | #scalarlm.api_url = "https://greg1232--cray-nvidia-llama-3-2-3b-instruct-fastapi-app.modal.run" 8 | 9 | 10 | def get_dataset(count): 11 | dataset = [] 12 | 13 | for i in range(count): 14 | dataset.append(f"What is {i} + {i}?") 15 | 16 | return dataset 17 | 18 | 19 | llm = scalarlm.SupermassiveIntelligence() 20 | 21 | dataset = get_dataset(count=3) 22 | 23 | results = llm.embed(prompts=dataset, 24 | # generate with default model 25 | # model_name="c7c3ed39e0005e0e73145d49510c94d7b5e4f6552cd35c4a7a8b37d0b41f318e" 26 | ) 27 | 28 | 
print(results) 29 | 30 | -------------------------------------------------------------------------------- /test/deployment/generate.py: -------------------------------------------------------------------------------- 1 | import scalarlm 2 | 3 | scalarlm.api_url = "http://localhost:8000" 4 | 5 | def get_dataset(count): 6 | dataset = [] 7 | 8 | for i in range(count): 9 | dataset.append(f"What is {i} + {i}?") 10 | 11 | return dataset 12 | 13 | 14 | llm = scalarlm.SupermassiveIntelligence() 15 | 16 | dataset = get_dataset(count=1) 17 | 18 | results = llm.generate( 19 | prompts=dataset, 20 | max_tokens=200, 21 | ) 22 | 23 | print(results) 24 | -------------------------------------------------------------------------------- /test/deployment/health.py: -------------------------------------------------------------------------------- 1 | import scalarlm 2 | 3 | scalarlm.api_url = "http://localhost:8000" 4 | 5 | llm = scalarlm.SupermassiveIntelligence() 6 | 7 | results = llm.health() 8 | 9 | print(results) 10 | -------------------------------------------------------------------------------- /test/deployment/train.py: -------------------------------------------------------------------------------- 1 | import scalarlm 2 | 3 | scalarlm.api_url = "http://localhost:8000" 4 | 5 | def get_dataset(): 6 | dataset = [] 7 | 8 | count = 1 9 | 10 | for i in range(count): 11 | dataset.append({"input": f"What is {i} + {i}?", "output": str(i + i)}) 12 | 13 | return dataset * 100 14 | 15 | 16 | llm = scalarlm.SupermassiveIntelligence(api_url=scalarlm.api_url) 17 | 18 | dataset = get_dataset() 19 | 20 | status = llm.train( 21 | dataset, 22 | train_args={"max_steps": 100, "learning_rate": 1e-4, "gpus": 2, 23 | "max_token_block_size": 4096, 24 | "steps_per_checkpoint": 10000}, 25 | ) 26 | 27 | print(status) 28 | -------------------------------------------------------------------------------- /test/infra/generate.py: -------------------------------------------------------------------------------- 1 | from cray_infra.one_server.start_cray_server import start_cray_server 2 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm 3 | 4 | import masint 5 | 6 | import unittest 7 | 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TestGenerate(unittest.IsolatedAsyncioTestCase): 14 | async def asyncSetUp(self): 15 | 16 | logger.info("Starting server") 17 | 18 | self.app = await start_cray_server(server_list=["api", "vllm"]) 19 | 20 | logger.debug(f"Server started: {self.app}") 21 | 22 | async def test_generate_single(self): 23 | logger.debug("Testing generate single") 24 | 25 | await wait_for_vllm() 26 | 27 | llm = masint.AsyncSupermassiveIntelligence() 28 | 29 | result = await llm.generate(prompts=["What is 1 + 1?"]) 30 | 31 | logger.debug(f"Result: {result}") 32 | 33 | async def test_generate_batch(self): 34 | logger.debug("Testing generate batch") 35 | 36 | await wait_for_vllm() 37 | 38 | llm = masint.AsyncSupermassiveIntelligence() 39 | 40 | prompts = [ 41 | "What is 1 + 1?", 42 | "What is 2 + 2?", 43 | "What is 3 + 3?", 44 | "What is 4 + 4?", 45 | ] 46 | 47 | result = await llm.generate(prompts=prompts) 48 | 49 | logger.debug(f"Result: {result}") 50 | 51 | async def asyncTearDown(self): 52 | logger.debug("Shutting down server") 53 | await self.app.shutdown() 54 | -------------------------------------------------------------------------------- /test/infra/get_results.py: -------------------------------------------------------------------------------- 1 | from 
cray_infra.one_server.start_cray_server import start_cray_server 2 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm 3 | 4 | import masint 5 | 6 | import unittest 7 | 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TestGetResults(unittest.IsolatedAsyncioTestCase): 14 | async def asyncSetUp(self): 15 | 16 | logger.info("Starting server") 17 | 18 | self.app = await start_cray_server(server_list=["api", "vllm"]) 19 | 20 | logger.debug(f"Server started: {self.app}") 21 | 22 | async def test_generate_batch(self): 23 | logger.debug("Testing generate batch") 24 | 25 | await wait_for_vllm() 26 | 27 | llm = masint.AsyncSupermassiveIntelligence() 28 | 29 | prompts = ["What is 1 + 1?", "What is 2 + 2?", "What is 3 + 3?", "What is 4 + 4?"] 30 | 31 | results = await llm.submit_generate(prompts=prompts) 32 | 33 | logger.debug(f"Results: {results}") 34 | 35 | ids = [r["request_id"] for r in results["results"]] 36 | 37 | logger.debug(f"IDs: {ids}") 38 | 39 | new_results = await llm.get_results(ids) 40 | 41 | logger.debug(f"New Results: {new_results}") 42 | 43 | for r in new_results["results"]: 44 | self.assertTrue(r["request_id"] in ids) 45 | 46 | 47 | async def asyncTearDown(self): 48 | logger.debug("Shutting down server") 49 | await self.app.shutdown() 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /test/infra/health.py: -------------------------------------------------------------------------------- 1 | from cray_infra.one_server.start_cray_server import start_cray_server 2 | from cray_infra.util.get_config import get_config 3 | 4 | import masint 5 | 6 | import aiohttp 7 | import unittest 8 | import pytest 9 | 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | logging.basicConfig(level=logging.DEBUG) 15 | 16 | 17 | class TestHealth(unittest.IsolatedAsyncioTestCase): 18 | async def asyncSetUp(self): 19 | 20 | logger.info("Starting server") 21 | 22 | self.app = await start_cray_server(server_list=["api"]) 23 | 24 | logger.debug(f"Server started: {self.app}") 25 | 26 | async def test_health(self): 27 | logger.debug("Testing health endpoint") 28 | health_status = await get_health() 29 | 30 | self.assertEqual(health_status["api"], "up") 31 | 32 | async def test_health_client(self): 33 | logger.debug("Testing health endpoint with client") 34 | 35 | llm = masint.AsyncSupermassiveIntelligence() 36 | 37 | status = await llm.health() 38 | 39 | self.assertEqual(status["api"], "up") 40 | 41 | async def asyncTearDown(self): 42 | logger.debug("Shutting down server") 43 | await self.app.shutdown() 44 | 45 | 46 | async def get_health(): 47 | config = get_config() 48 | 49 | async with aiohttp.ClientSession() as session: 50 | async with session.get(config["api_url"] + "/v1/health") as response: 51 | return await response.json() 52 | -------------------------------------------------------------------------------- /test/infra/openai_client.py: -------------------------------------------------------------------------------- 1 | from cray_infra.util.get_config import get_config 2 | 3 | from cray_infra.one_server.start_cray_server import start_cray_server 4 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm 5 | 6 | from openai import AsyncOpenAI 7 | 8 | import unittest 9 | import asyncio 10 | 11 | import logging 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class TestOpenAIClient(unittest.IsolatedAsyncioTestCase): 17 | async def asyncSetUp(self): 18 | 19 | 
logger.info("Starting server") 20 | 21 | self.app = await start_cray_server(server_list=["api", "vllm"]) 22 | 23 | logger.debug(f"Server started: {self.app}") 24 | 25 | async def test_openai_client(self): 26 | logger.debug("Testing openai client") 27 | 28 | await wait_for_vllm() 29 | 30 | config = get_config() 31 | 32 | client = AsyncOpenAI( 33 | base_url=config["api_url"] + "/v1/openai", 34 | api_key="token-abc123", 35 | ) 36 | 37 | completion = await client.chat.completions.create( 38 | model=config["model"], 39 | messages=[{"role": "user", "content": "Hello!"}], 40 | max_tokens=10, 41 | ) 42 | 43 | print(completion.choices[0].message) 44 | 45 | async def asyncTearDown(self): 46 | logger.debug("Shutting down server") 47 | await self.app.shutdown() 48 | -------------------------------------------------------------------------------- /test/infra/sanity.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestSanity(unittest.TestCase): 5 | def test_sanity(self): 6 | print("Sanity test") 7 | self.assertTrue(True) 8 | -------------------------------------------------------------------------------- /test/infra/slurm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import unittest 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class TestSlurm(unittest.TestCase): 9 | def test_srun(self): 10 | run_command = [ 11 | "srun", 12 | "hostname", 13 | ] 14 | result = subprocess.run(run_command, stdout=subprocess.PIPE) 15 | 16 | logger.debug(f"result: {result}") 17 | 18 | self.assertTrue(result.returncode == 0) 19 | -------------------------------------------------------------------------------- /test/infra/upload_dataset.py: -------------------------------------------------------------------------------- 1 | from cray_infra.one_server.start_cray_server import start_cray_server 2 | 3 | import masint 4 | 5 | import unittest 6 | 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class TestUploadDataset(unittest.IsolatedAsyncioTestCase): 13 | async def asyncSetUp(self): 14 | 15 | logger.info("Starting server") 16 | 17 | self.app = await start_cray_server(server_list=["api"]) 18 | 19 | logger.debug(f"Server started: {self.app}") 20 | 21 | async def test_upload_dataset(self): 22 | logger.debug("Testing upload ability of train endpoint") 23 | 24 | llm = masint.AsyncSupermassiveIntelligence() 25 | 26 | dataset = get_dataset() 27 | 28 | status = await llm.train(dataset, train_args={"max_steps": 1}) 29 | 30 | async def asyncTearDown(self): 31 | logger.debug("Shutting down server") 32 | await self.app.shutdown() 33 | 34 | 35 | def get_dataset(): 36 | dataset = [] 37 | 38 | count = 10000 39 | 40 | for i in range(count): 41 | dataset.append( 42 | {"input": f"What is {i} + {i}", "output": "The answer is " + str(i + i)} 43 | ) 44 | 45 | return dataset 46 | -------------------------------------------------------------------------------- /test/infra/vllm_health.py: -------------------------------------------------------------------------------- 1 | from cray_infra.one_server.start_cray_server import start_cray_server 2 | from cray_infra.one_server.wait_for_vllm import get_vllm_health, wait_for_vllm 3 | 4 | import unittest 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class TestVLLMHealth(unittest.IsolatedAsyncioTestCase): 12 | async def asyncSetUp(self): 13 | 14 | logger.info("Starting server") 15 | 
16 | self.app = await start_cray_server(server_list=["vllm"]) 17 | 18 | logger.debug(f"Server started: {self.app}") 19 | 20 | async def test_vllm_health(self): 21 | logger.debug("Testing health endpoint") 22 | 23 | await wait_for_vllm() 24 | 25 | health_status = await get_vllm_health() 26 | 27 | self.assertEqual(health_status, 200) 28 | 29 | async def asyncTearDown(self): 30 | logger.debug("Shutting down server") 31 | await self.app.shutdown() 32 | -------------------------------------------------------------------------------- /test/ml/rl/cs_semester.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/test/ml/rl/cs_semester.sqlite -------------------------------------------------------------------------------- /test/ml/tokenformer/test_llama_tokenformer_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from transformers import LlamaForCausalLM, LlamaTokenizer 4 | 5 | from ml.tokenformer.llama_tokenformer_model import create_llama_tokenformer_model 6 | 7 | @pytest.fixture 8 | def model_setup(): 9 | model_name = "masint/tiny-random-llama" 10 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 11 | tokenizer = LlamaTokenizer.from_pretrained(model_name) 12 | model = LlamaForCausalLM.from_pretrained(model_name).to(device) 13 | return model 14 | 15 | def test_create_llama_tokenformer_model(model_setup): 16 | model = model_setup 17 | # lm_head is trained by default 18 | result = create_llama_tokenformer_model(model, "cpu") 19 | 20 | # Check requires_grad is set correctly 21 | for name, param in result.named_parameters(): 22 | if any(module_name in name for module_name in ["tokenformer", "lm_head"]): 23 | assert param.requires_grad 24 | else: 25 | assert not param.requires_grad 26 | 27 | def test_create_llama_tokenformer_model_no_lm_head(model_setup): 28 | model = model_setup 29 | # lm_head should not be trained 30 | result = create_llama_tokenformer_model(model=model, device="cpu", train_lm_head=False) 31 | 32 | # Check requires_grad is set correctly 33 | for name, param in result.named_parameters(): 34 | if any(module_name in name for module_name in ["tokenformer"]): 35 | assert param.requires_grad 36 | else: 37 | assert not param.requires_grad 38 | -------------------------------------------------------------------------------- /test/requirements-pytest.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | pytest-xdist 4 | pytest-forked 5 | pytest-asyncio 6 | pytest-dotenv 7 | pytest-mock 8 | pytest-rerunfailures 9 | pytest-timeout 10 | codecov 11 | --------------------------------------------------------------------------------
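As a closing orientation note, the test modules above are written as unittest / unittest.IsolatedAsyncioTestCase classes and are collected by pytest alongside the plugins pinned in test/requirements-pytest.txt. A minimal sketch of invoking one suite programmatically, assuming pytest is installed and the command is run from the repository root; the test selection is illustrative only, and the infra suites that start the API or vLLM server need the corresponding runtime environment:

    # Minimal sketch: run the dependency-free sanity suite through
    # pytest's Python entry point and propagate its exit code.
    import pytest

    exit_code = pytest.main(["-v", "test/infra/sanity.py"])
    raise SystemExit(exit_code)
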