├── .github
    └── workflows
    │   ├── depot-amd.yml
    │   ├── depot-cpu.yml
    │   ├── depot-nvidia-8.0.yml
    │   ├── depot-nvidia-8.6.yml
    │   ├── depot-nvidia.yml
    │   └── unit-tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── Faq.md
├── LICENSE
├── README.md
├── cmd
    ├── bashly-settings.yml
    ├── bashly.sh
    ├── bashly.yml
    ├── benchmark_command.sh
    ├── build_image_command.sh
    ├── depot_build_command.sh
    ├── lib
    │   └── colors.sh
    ├── llm_logs_command.sh
    ├── llm_ls_command.sh
    ├── llm_plot_command.sh
    ├── llm_squeue_command.sh
    ├── pypi_command.sh
    ├── test_command.sh
    └── up_command.sh
├── deployment
    ├── ansible
    │   ├── hosts
    │   └── k8.yaml
    └── helm
    │   ├── amd_multi_node
    │       └── scalarlm
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── api_configmap.yaml
    │       │       ├── api_deployment.yaml
    │       │       ├── api_service.yaml
    │       │       ├── vllm_configmap.yaml
    │       │       ├── vllm_deployment.yaml
    │       │       └── vllm_service.yaml
    │       │   └── values.yaml
    │   ├── amd_single_node
    │       └── scalarlm
    │       │   ├── Chart.yaml
    │       │   ├── local-hostpath-sc.yaml
    │       │   ├── storageclass-clusterrole.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── api_configmap.yaml
    │       │       ├── api_deployment.yaml
    │       │       ├── api_service.yaml
    │       │       ├── cache_pvc.yaml
    │       │       ├── jobs_pvc.yaml
    │       │       ├── vllm_configmap.yaml
    │       │       ├── vllm_deployment.yaml
    │       │       └── vllm_service.yaml
    │       │   └── values.yaml
    │   ├── amd_single_pod
    │       └── scalarlm
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── configmap.yaml
    │       │       ├── deployment.yaml
    │       │       └── service.yaml
    │       │   └── values.yaml
    │   ├── cray
    │       ├── .helmignore
    │       ├── Chart.yaml
    │       ├── templates
    │       │   ├── NOTES.txt
    │       │   ├── _helpers.tpl
    │       │   ├── deployment.yaml
    │       │   ├── hpa.yaml
    │       │   ├── ingress.yaml
    │       │   ├── service.yaml
    │       │   ├── serviceaccount.yaml
    │       │   └── tests
    │       │   │   └── test-connection.yaml
    │       └── values.yaml
    │   ├── lambda
    │       └── cray
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── configmap.yaml
    │       │       ├── deployment.yaml
    │       │       └── service.yaml
    │       │   └── values.yaml
    │   ├── minikube
    │       └── cray
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── deployment.yaml
    │       │       └── service.yaml
    │       │   └── values.yaml
    │   ├── tensorwave3b
    │       └── scalarlm
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── api_configmap.yaml
    │       │       ├── api_deployment.yaml
    │       │       ├── api_service.yaml
    │       │       ├── cache_pvc.yaml
    │       │       ├── jobs_pvc.yaml
    │       │       ├── vllm_configmap.yaml
    │       │       ├── vllm_deployment.yaml
    │       │       └── vllm_service.yaml
    │       │   └── values.yaml
    │   ├── tensorwave70b
    │       └── scalarlm
    │       │   ├── Chart.yaml
    │       │   ├── templates
    │       │       ├── _helpers.tpl
    │       │       ├── api_configmap.yaml
    │       │       ├── api_deployment.yaml
    │       │       ├── api_service.yaml
    │       │       ├── cache_pvc.yaml
    │       │       ├── jobs_pvc.yaml
    │       │       ├── vllm_configmap.yaml
    │       │       ├── vllm_deployment.yaml
    │       │       └── vllm_service.yaml
    │       │   └── values.yaml
    │   └── tensorwave8b
    │       └── scalarlm
    │           ├── Chart.yaml
    │           ├── templates
    │               ├── _helpers.tpl
    │               ├── api_configmap.yaml
    │               ├── api_deployment.yaml
    │               ├── api_service.yaml
    │               ├── cache_pvc.yaml
    │               ├── jobs_pvc.yaml
    │               ├── vllm_configmap.yaml
    │               ├── vllm_deployment.yaml
    │               └── vllm_service.yaml
    │           └── values.yaml
├── docker-compose.yaml
├── docs
    ├── cray-docs
    │   ├── docs
    │   │   ├── arch.md
    │   │   ├── assets
    │   │   │   ├── cray-arch.png
    │   │   │   ├── cray.jpeg
    │   │   │   └── loss_plot_044db4ac60.png
    │   │   ├── cli
    │   │   │   ├── cli.md
    │   │   │   ├── list-models.md
    │   │   │   ├── plot.md
    │   │   │   ├── squeue.md
    │   │   │   └── training-logs.md
    │   │   ├── contact.md
    │   │   ├── deployment
    │   │   │   ├── docker.md
    │   │   │   ├── kubernetes.md
    │   │   │   ├── laptop.md
    │   │   │   ├── modal-details.md
    │   │   │   └── modal.md
    │   │   ├── index.md
    │   │   ├── inference.md
    │   │   ├── quickstart.md
    │   │   └── training.md
    │   └── mkdocs.yml
    └── deploy.sh
├── frontend
    └── assets
    │   └── logo.svg
├── infra
    ├── CMakeLists.txt
    ├── cmake
    │   ├── cpu_extension.cmake
    │   ├── hipify.py
    │   └── utils.cmake
    ├── cray_infra
    │   ├── api
    │   │   ├── fastapi
    │   │   │   ├── aiohttp
    │   │   │   │   └── get_global_session.py
    │   │   │   ├── generate
    │   │   │   │   ├── embed.py
    │   │   │   │   ├── finish_work.py
    │   │   │   │   ├── generate.py
    │   │   │   │   ├── get_results.py
    │   │   │   │   ├── get_work.py
    │   │   │   │   └── poll_for_responses.py
    │   │   │   ├── health
    │   │   │   │   └── check_health.py
    │   │   │   ├── main.py
    │   │   │   ├── routers
    │   │   │   │   ├── generate_router.py
    │   │   │   │   ├── health_router.py
    │   │   │   │   ├── megatron_router.py
    │   │   │   │   ├── openai_router.py
    │   │   │   │   └── request_types
    │   │   │   │   │   ├── embed_request.py
    │   │   │   │   │   ├── finish_work_request.py
    │   │   │   │   │   ├── generate_request.py
    │   │   │   │   │   ├── generate_response.py
    │   │   │   │   │   ├── get_results_request.py
    │   │   │   │   │   ├── get_results_response.py
    │   │   │   │   │   ├── get_work_request.py
    │   │   │   │   │   ├── get_work_response.py
    │   │   │   │   │   ├── list_models_response.py
    │   │   │   │   │   ├── squeue_response.py
    │   │   │   │   │   └── train_request.py
    │   │   │   └── tasks
    │   │   │   │   └── add_megatron_tasks.py
    │   │   └── work_queue
    │   │   │   └── inference_work_queue.py
    │   ├── generate
    │   │   └── clear_acked_requests_from_queue.py
    │   ├── one_server
    │   │   ├── create_api.py
    │   │   ├── create_vllm.py
    │   │   ├── main.py
    │   │   ├── start_cray_server.py
    │   │   └── wait_for_vllm.py
    │   ├── slurm
    │   │   └── discovery
    │   │   │   └── discover_clusters.py
    │   ├── training
    │   │   ├── distribution_strategy
    │   │   │   └── fsdp
    │   │   │   │   └── fsdp.py
    │   │   ├── get_latest_model.py
    │   │   ├── get_training_job_info.py
    │   │   ├── gpu_aware_mpi
    │   │   │   ├── gpu_aware_mpi.cpp
    │   │   │   └── setup.py
    │   │   ├── launch_training_job.py
    │   │   ├── list_models.py
    │   │   ├── metrics.py
    │   │   ├── print_logo.py
    │   │   ├── register_megatron_models.py
    │   │   ├── restart_megatron_jobs.py
    │   │   ├── squeue.py
    │   │   ├── training_harness.py
    │   │   ├── training_job_status.py
    │   │   ├── training_logs_generator.py
    │   │   ├── upload_training_data.py
    │   │   └── vllm_model_manager.py
    │   ├── util
    │   │   ├── default_config.py
    │   │   ├── default_job_config.py
    │   │   ├── get_config.py
    │   │   └── get_job_config.py
    │   └── vllm
    │   │   ├── __init__.py
    │   │   ├── _core_ext.py
    │   │   ├── _custom_ops.py
    │   │   ├── _ipex_ops.py
    │   │   ├── _version.py
    │   │   ├── adapter_commons
    │   │       ├── __init__.py
    │   │       ├── layers.py
    │   │       ├── models.py
    │   │       ├── request.py
    │   │       ├── utils.py
    │   │       └── worker_manager.py
    │   │   ├── assets
    │   │       ├── __init__.py
    │   │       ├── audio.py
    │   │       ├── base.py
    │   │       ├── image.py
    │   │       └── video.py
    │   │   ├── attention
    │   │       ├── __init__.py
    │   │       ├── backends
    │   │       │   ├── __init__.py
    │   │       │   ├── abstract.py
    │   │       │   ├── blocksparse_attn.py
    │   │       │   ├── flash_attn.py
    │   │       │   ├── flashinfer.py
    │   │       │   ├── ipex_attn.py
    │   │       │   ├── openvino.py
    │   │       │   ├── pallas.py
    │   │       │   ├── rocm_flash_attn.py
    │   │       │   ├── torch_sdpa.py
    │   │       │   ├── utils.py
    │   │       │   └── xformers.py
    │   │       ├── layer.py
    │   │       ├── ops
    │   │       │   ├── __init__.py
    │   │       │   ├── blocksparse_attention
    │   │       │   │   ├── __init__.py
    │   │       │   │   ├── blocksparse_attention_kernel.py
    │   │       │   │   ├── interface.py
    │   │       │   │   └── utils.py
    │   │       │   ├── ipex_attn.py
    │   │       │   ├── paged_attn.py
    │   │       │   ├── prefix_prefill.py
    │   │       │   └── triton_flash_attention.py
    │   │       └── selector.py
    │   │   ├── beam_search.py
    │   │   ├── block.py
    │   │   ├── compilation
    │   │       ├── __init__.py
    │   │       ├── backends.py
    │   │       └── wrapper.py
    │   │   ├── config.py
    │   │   ├── connections.py
    │   │   ├── core
    │   │       ├── __init__.py
    │   │       ├── block
    │   │       │   ├── __init__.py
    │   │       │   ├── block_table.py
    │   │       │   ├── common.py
    │   │       │   ├── cpu_gpu_block_allocator.py
    │   │       │   ├── interfaces.py
    │   │       │   ├── naive_block.py
    │   │       │   ├── prefix_caching_block.py
    │   │       │   └── utils.py
    │   │       ├── block_manager_v1.py
    │   │       ├── block_manager_v2.py
    │   │       ├── embedding_model_block_manager.py
    │   │       ├── evictor_v1.py
    │   │       ├── evictor_v2.py
    │   │       ├── interfaces.py
    │   │       └── scheduler.py
    │   │   ├── distributed
    │   │       ├── __init__.py
    │   │       ├── communication_op.py
    │   │       ├── device_communicators
    │   │       │   ├── __init__.py
    │   │       │   ├── cuda_wrapper.py
    │   │       │   ├── custom_all_reduce.py
    │   │       │   ├── custom_all_reduce_utils.py
    │   │       │   ├── pynccl.py
    │   │       │   ├── pynccl_wrapper.py
    │   │       │   ├── shm_broadcast.py
    │   │       │   └── tpu_communicator.py
    │   │       ├── parallel_state.py
    │   │       └── utils.py
    │   │   ├── engine
    │   │       ├── __init__.py
    │   │       ├── arg_utils.py
    │   │       ├── async_llm_engine.py
    │   │       ├── async_timeout.py
    │   │       ├── llm_engine.py
    │   │       ├── metrics.py
    │   │       ├── metrics_types.py
    │   │       ├── multiprocessing
    │   │       │   ├── __init__.py
    │   │       │   ├── client.py
    │   │       │   └── engine.py
    │   │       ├── output_processor
    │   │       │   ├── __init__.py
    │   │       │   ├── interfaces.py
    │   │       │   ├── multi_step.py
    │   │       │   ├── single_step.py
    │   │       │   ├── stop_checker.py
    │   │       │   └── util.py
    │   │       └── protocol.py
    │   │   ├── entrypoints
    │   │       ├── __init__.py
    │   │       ├── api_server.py
    │   │       ├── chat_utils.py
    │   │       ├── launcher.py
    │   │       ├── llm.py
    │   │       ├── logger.py
    │   │       └── openai
    │   │       │   ├── __init__.py
    │   │       │   ├── api_server.py
    │   │       │   ├── cli_args.py
    │   │       │   ├── logits_processors.py
    │   │       │   ├── protocol.py
    │   │       │   ├── run_batch.py
    │   │       │   ├── serving_chat.py
    │   │       │   ├── serving_completion.py
    │   │       │   ├── serving_embedding.py
    │   │       │   ├── serving_engine.py
    │   │       │   ├── serving_tokenization.py
    │   │       │   └── tool_parsers
    │   │       │       ├── __init__.py
    │   │       │       ├── abstract_tool_parser.py
    │   │       │       ├── hermes_tool_parser.py
    │   │       │       ├── internlm2_tool_parser.py
    │   │       │       ├── llama_tool_parser.py
    │   │       │       ├── mistral_tool_parser.py
    │   │       │       └── utils.py
    │   │   ├── envs.py
    │   │   ├── executor
    │   │       ├── __init__.py
    │   │       ├── cpu_executor.py
    │   │       ├── distributed_gpu_executor.py
    │   │       ├── executor_base.py
    │   │       ├── gpu_executor.py
    │   │       ├── msgspec_utils.py
    │   │       ├── multiproc_gpu_executor.py
    │   │       ├── multiproc_worker_utils.py
    │   │       ├── multiproc_xpu_executor.py
    │   │       ├── neuron_executor.py
    │   │       ├── openvino_executor.py
    │   │       ├── ray_gpu_executor.py
    │   │       ├── ray_tpu_executor.py
    │   │       ├── ray_utils.py
    │   │       ├── ray_xpu_executor.py
    │   │       ├── tpu_executor.py
    │   │       └── xpu_executor.py
    │   │   ├── forward_context.py
    │   │   ├── inputs
    │   │       ├── __init__.py
    │   │       ├── data.py
    │   │       ├── parse.py
    │   │       ├── preprocess.py
    │   │       └── registry.py
    │   │   ├── logger.py
    │   │   ├── logging
    │   │       ├── __init__.py
    │   │       └── formatter.py
    │   │   ├── lora
    │   │       ├── __init__.py
    │   │       ├── fully_sharded_layers.py
    │   │       ├── layers.py
    │   │       ├── lora.py
    │   │       ├── models.py
    │   │       ├── ops
    │   │       │   ├── __init__.py
    │   │       │   ├── bgmv_expand.py
    │   │       │   ├── bgmv_expand_slice.py
    │   │       │   ├── bgmv_shrink.py
    │   │       │   ├── sgmv_expand.py
    │   │       │   ├── sgmv_expand_slice.py
    │   │       │   ├── sgmv_shrink.py
    │   │       │   └── utils.py
    │   │       ├── punica.py
    │   │       ├── request.py
    │   │       ├── utils.py
    │   │       └── worker_manager.py
    │   │   ├── model_executor
    │   │       ├── __init__.py
    │   │       ├── custom_op.py
    │   │       ├── guided_decoding
    │   │       │   ├── __init__.py
    │   │       │   ├── guided_fields.py
    │   │       │   ├── lm_format_enforcer_decoding.py
    │   │       │   ├── outlines_decoding.py
    │   │       │   └── outlines_logits_processors.py
    │   │       ├── layers
    │   │       │   ├── __init__.py
    │   │       │   ├── activation.py
    │   │       │   ├── fused_moe
    │   │       │   │   ├── __init__.py
    │   │       │   │   ├── configs
    │   │       │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
    │   │       │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
    │   │       │   │   │   ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
    │   │       │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
    │   │       │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
    │   │       │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
    │   │       │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
    │   │       │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
    │   │       │   │   │   ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
    │   │       │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
    │   │       │   │   │   ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
    │   │       │   │   │   └── README
    │   │       │   │   ├── fused_marlin_moe.py
    │   │       │   │   ├── fused_moe.py
    │   │       │   │   ├── layer.py
    │   │       │   │   └── moe_pallas.py
    │   │       │   ├── layernorm.py
    │   │       │   ├── linear.py
    │   │       │   ├── logits_processor.py
    │   │       │   ├── mamba
    │   │       │   │   ├── __init__.py
    │   │       │   │   └── ops
    │   │       │   │   │   ├── __init__.py
    │   │       │   │   │   ├── causal_conv1d.py
    │   │       │   │   │   └── mamba_ssm.py
    │   │       │   ├── pooler.py
    │   │       │   ├── quantization
    │   │       │   │   ├── __init__.py
    │   │       │   │   ├── aqlm.py
    │   │       │   │   ├── awq.py
    │   │       │   │   ├── awq_marlin.py
    │   │       │   │   ├── awq_triton.py
    │   │       │   │   ├── base_config.py
    │   │       │   │   ├── bitsandbytes.py
    │   │       │   │   ├── compressed_tensors
    │   │       │   │   │   ├── __init__.py
    │   │       │   │   │   ├── compressed_tensors.py
    │   │       │   │   │   ├── compressed_tensors_moe.py
    │   │       │   │   │   ├── schemes
    │   │       │   │   │   │   ├── __init__.py
    │   │       │   │   │   │   ├── compressed_tensors_scheme.py
    │   │       │   │   │   │   ├── compressed_tensors_w4a16_24.py
    │   │       │   │   │   │   ├── compressed_tensors_w8a16_fp8.py
    │   │       │   │   │   │   ├── compressed_tensors_w8a8_fp8.py
    │   │       │   │   │   │   ├── compressed_tensors_w8a8_int8.py
    │   │       │   │   │   │   └── compressed_tensors_wNa16.py
    │   │       │   │   │   └── utils.py
    │   │       │   │   ├── deepspeedfp.py
    │   │       │   │   ├── experts_int8.py
    │   │       │   │   ├── fbgemm_fp8.py
    │   │       │   │   ├── fp8.py
    │   │       │   │   ├── gguf.py
    │   │       │   │   ├── gptq.py
    │   │       │   │   ├── gptq_marlin.py
    │   │       │   │   ├── gptq_marlin_24.py
    │   │       │   │   ├── ipex_quant.py
    │   │       │   │   ├── kernels
    │   │       │   │   │   ├── MPLinearKernel.py
    │   │       │   │   │   ├── __init__.py
    │   │       │   │   │   ├── machete.py
    │   │       │   │   │   └── marlin.py
    │   │       │   │   ├── kv_cache.py
    │   │       │   │   ├── marlin.py
    │   │       │   │   ├── modelopt.py
    │   │       │   │   ├── neuron_quant.py
    │   │       │   │   ├── qqq.py
    │   │       │   │   ├── schema.py
    │   │       │   │   ├── tpu_int8.py
    │   │       │   │   └── utils
    │   │       │   │   │   ├── __init__.py
    │   │       │   │   │   ├── layer_utils.py
    │   │       │   │   │   ├── machete_utils.py
    │   │       │   │   │   ├── marlin_utils.py
    │   │       │   │   │   ├── marlin_utils_fp8.py
    │   │       │   │   │   ├── marlin_utils_test.py
    │   │       │   │   │   ├── marlin_utils_test_24.py
    │   │       │   │   │   ├── marlin_utils_test_qqq.py
    │   │       │   │   │   ├── quant_utils.py
    │   │       │   │   │   └── w8a8_utils.py
    │   │       │   ├── rejection_sampler.py
    │   │       │   ├── resampler.py
    │   │       │   ├── rotary_embedding.py
    │   │       │   ├── sampler.py
    │   │       │   ├── spec_decode_base_sampler.py
    │   │       │   ├── typical_acceptance_sampler.py
    │   │       │   └── vocab_parallel_embedding.py
    │   │       ├── model_loader
    │   │       │   ├── __init__.py
    │   │       │   ├── loader.py
    │   │       │   ├── neuron.py
    │   │       │   ├── openvino.py
    │   │       │   ├── tensorizer.py
    │   │       │   ├── utils.py
    │   │       │   └── weight_utils.py
    │   │       ├── models
    │   │       │   ├── __init__.py
    │   │       │   ├── arctic.py
    │   │       │   ├── baichuan.py
    │   │       │   ├── bart.py
    │   │       │   ├── blip.py
    │   │       │   ├── blip2.py
    │   │       │   ├── bloom.py
    │   │       │   ├── chameleon.py
    │   │       │   ├── chatglm.py
    │   │       │   ├── clip.py
    │   │       │   ├── commandr.py
    │   │       │   ├── dbrx.py
    │   │       │   ├── decilm.py
    │   │       │   ├── deepseek.py
    │   │       │   ├── deepseek_v2.py
    │   │       │   ├── eagle.py
    │   │       │   ├── exaone.py
    │   │       │   ├── falcon.py
    │   │       │   ├── fuyu.py
    │   │       │   ├── gemma.py
    │   │       │   ├── gemma2.py
    │   │       │   ├── gemma2_embedding.py
    │   │       │   ├── gpt2.py
    │   │       │   ├── gpt_bigcode.py
    │   │       │   ├── gpt_j.py
    │   │       │   ├── gpt_neox.py
    │   │       │   ├── granite.py
    │   │       │   ├── granitemoe.py
    │   │       │   ├── idefics2_vision_model.py
    │   │       │   ├── interfaces.py
    │   │       │   ├── interfaces_base.py
    │   │       │   ├── intern_vit.py
    │   │       │   ├── internlm2.py
    │   │       │   ├── internvl.py
    │   │       │   ├── jais.py
    │   │       │   ├── jamba.py
    │   │       │   ├── llama.py
    │   │       │   ├── llama_embedding.py
    │   │       │   ├── llava.py
    │   │       │   ├── llava_next.py
    │   │       │   ├── llava_next_video.py
    │   │       │   ├── llava_onevision.py
    │   │       │   ├── medusa.py
    │   │       │   ├── minicpm.py
    │   │       │   ├── minicpm3.py
    │   │       │   ├── minicpmv.py
    │   │       │   ├── mixtral.py
    │   │       │   ├── mixtral_quant.py
    │   │       │   ├── mllama.py
    │   │       │   ├── mlp_speculator.py
    │   │       │   ├── module_mapping.py
    │   │       │   ├── mpt.py
    │   │       │   ├── nemotron.py
    │   │       │   ├── nvlm_d.py
    │   │       │   ├── olmo.py
    │   │       │   ├── olmoe.py
    │   │       │   ├── opt.py
    │   │       │   ├── orion.py
    │   │       │   ├── paligemma.py
    │   │       │   ├── persimmon.py
    │   │       │   ├── phi.py
    │   │       │   ├── phi3.py
    │   │       │   ├── phi3_small.py
    │   │       │   ├── phi3v.py
    │   │       │   ├── phimoe.py
    │   │       │   ├── pixtral.py
    │   │       │   ├── qwen.py
    │   │       │   ├── qwen2.py
    │   │       │   ├── qwen2_moe.py
    │   │       │   ├── qwen2_rm.py
    │   │       │   ├── qwen2_vl.py
    │   │       │   ├── registry.py
    │   │       │   ├── siglip.py
    │   │       │   ├── solar.py
    │   │       │   ├── stablelm.py
    │   │       │   ├── starcoder2.py
    │   │       │   ├── ultravox.py
    │   │       │   ├── utils.py
    │   │       │   └── xverse.py
    │   │       ├── parameter.py
    │   │       ├── pooling_metadata.py
    │   │       ├── sampling_metadata.py
    │   │       └── utils.py
    │   │   ├── multimodal
    │   │       ├── __init__.py
    │   │       ├── audio.py
    │   │       ├── base.py
    │   │       ├── image.py
    │   │       ├── registry.py
    │   │       ├── utils.py
    │   │       └── video.py
    │   │   ├── outputs.py
    │   │   ├── platforms
    │   │       ├── __init__.py
    │   │       ├── cpu.py
    │   │       ├── cuda.py
    │   │       ├── interface.py
    │   │       ├── rocm.py
    │   │       ├── tpu.py
    │   │       └── xpu.py
    │   │   ├── plugins
    │   │       └── __init__.py
    │   │   ├── pooling_params.py
    │   │   ├── prompt_adapter
    │   │       ├── __init__.py
    │   │       ├── layers.py
    │   │       ├── models.py
    │   │       ├── request.py
    │   │       ├── utils.py
    │   │       └── worker_manager.py
    │   │   ├── py.typed
    │   │   ├── sampling_params.py
    │   │   ├── scalar_type.py
    │   │   ├── scripts.py
    │   │   ├── sequence.py
    │   │   ├── spec_decode
    │   │       ├── __init__.py
    │   │       ├── batch_expansion.py
    │   │       ├── draft_model_runner.py
    │   │       ├── interfaces.py
    │   │       ├── medusa_worker.py
    │   │       ├── metrics.py
    │   │       ├── mlp_speculator_worker.py
    │   │       ├── mqa_scorer.py
    │   │       ├── multi_step_worker.py
    │   │       ├── ngram_worker.py
    │   │       ├── proposer_worker_base.py
    │   │       ├── smaller_tp_proposer_worker.py
    │   │       ├── spec_decode_worker.py
    │   │       ├── target_model_runner.py
    │   │       ├── top1_proposer.py
    │   │       └── util.py
    │   │   ├── tokenformer
    │   │       ├── __init__.py
    │   │       └── tokenformer_model_manager.py
    │   │   ├── tracing.py
    │   │   ├── transformers_utils
    │   │       ├── __init__.py
    │   │       ├── config.py
    │   │       ├── configs
    │   │       │   ├── __init__.py
    │   │       │   ├── arctic.py
    │   │       │   ├── chatglm.py
    │   │       │   ├── dbrx.py
    │   │       │   ├── eagle.py
    │   │       │   ├── exaone.py
    │   │       │   ├── falcon.py
    │   │       │   ├── internvl.py
    │   │       │   ├── jais.py
    │   │       │   ├── medusa.py
    │   │       │   ├── mllama.py
    │   │       │   ├── mlp_speculator.py
    │   │       │   ├── mpt.py
    │   │       │   ├── nemotron.py
    │   │       │   ├── nvlm_d.py
    │   │       │   ├── qwen2vl.py
    │   │       │   ├── solar.py
    │   │       │   └── ultravox.py
    │   │       ├── detokenizer.py
    │   │       ├── processor.py
    │   │       ├── tokenizer.py
    │   │       ├── tokenizer_group
    │   │       │   ├── __init__.py
    │   │       │   ├── base_tokenizer_group.py
    │   │       │   ├── ray_tokenizer_group.py
    │   │       │   └── tokenizer_group.py
    │   │       ├── tokenizers
    │   │       │   ├── __init__.py
    │   │       │   └── mistral.py
    │   │       └── utils.py
    │   │   ├── triton_utils
    │   │       ├── __init__.py
    │   │       ├── custom_cache_manager.py
    │   │       ├── importing.py
    │   │       └── libentry.py
    │   │   ├── usage
    │   │       ├── __init__.py
    │   │       └── usage_lib.py
    │   │   ├── utils.py
    │   │   ├── version.py
    │   │   ├── vllm_flash_attn
    │   │       └── .gitkeep
    │   │   └── worker
    │   │       ├── __init__.py
    │   │       ├── cache_engine.py
    │   │       ├── cpu_enc_dec_model_runner.py
    │   │       ├── cpu_model_runner.py
    │   │       ├── cpu_worker.py
    │   │       ├── embedding_model_runner.py
    │   │       ├── enc_dec_model_runner.py
    │   │       ├── model_runner.py
    │   │       ├── model_runner_base.py
    │   │       ├── multi_step_model_runner.py
    │   │       ├── multi_step_tpu_worker.py
    │   │       ├── multi_step_worker.py
    │   │       ├── neuron_model_runner.py
    │   │       ├── neuron_worker.py
    │   │       ├── openvino_model_runner.py
    │   │       ├── openvino_worker.py
    │   │       ├── tpu_model_runner.py
    │   │       ├── tpu_worker.py
    │   │       ├── utils.py
    │   │       ├── worker.py
    │   │       ├── worker_base.py
    │   │       ├── xpu_model_runner.py
    │   │       └── xpu_worker.py
    ├── csrc
    │   ├── activation_kernels.cu
    │   ├── attention
    │   │   ├── attention_dtypes.h
    │   │   ├── attention_generic.cuh
    │   │   ├── attention_kernels.cu
    │   │   ├── attention_utils.cuh
    │   │   ├── dtype_bfloat16.cuh
    │   │   ├── dtype_float16.cuh
    │   │   ├── dtype_float32.cuh
    │   │   └── dtype_fp8.cuh
    │   ├── cache.h
    │   ├── cache_kernels.cu
    │   ├── core
    │   │   ├── exception.hpp
    │   │   ├── registration.h
    │   │   ├── scalar_type.hpp
    │   │   └── torch_bindings.cpp
    │   ├── cpu
    │   │   ├── activation.cpp
    │   │   ├── attention.cpp
    │   │   ├── cache.cpp
    │   │   ├── cpu_types.hpp
    │   │   ├── cpu_types_arm.hpp
    │   │   ├── cpu_types_vsx.hpp
    │   │   ├── cpu_types_x86.hpp
    │   │   ├── dnnl_helper.hpp
    │   │   ├── layernorm.cpp
    │   │   ├── pos_encoding.cpp
    │   │   ├── quant.cpp
    │   │   ├── torch_bindings.cpp
    │   │   └── utils.cpp
    │   ├── cuda_compat.h
    │   ├── cuda_utils.h
    │   ├── cuda_utils_kernels.cu
    │   ├── custom_all_reduce.cu
    │   ├── custom_all_reduce.cuh
    │   ├── custom_all_reduce_test.cu
    │   ├── cutlass_extensions
    │   │   ├── cute_utils.cuh
    │   │   ├── torch_utils.hpp
    │   │   ├── vllm_collective_builder.cuh
    │   │   ├── vllm_custom_types.cuh
    │   │   ├── vllm_cutlass_library_extension.py
    │   │   └── vllm_numeric_conversion.cuh
    │   ├── dispatch_utils.h
    │   ├── layernorm_kernels.cu
    │   ├── mamba
    │   │   ├── causal_conv1d
    │   │   │   ├── causal_conv1d.cu
    │   │   │   ├── causal_conv1d.h
    │   │   │   └── static_switch.h
    │   │   └── mamba_ssm
    │   │   │   ├── selective_scan.h
    │   │   │   ├── selective_scan_fwd.cu
    │   │   │   └── static_switch.h
    │   ├── moe
    │   │   ├── marlin_kernels
    │   │   │   ├── marlin_moe_kernel.h
    │   │   │   ├── marlin_moe_kernel_ku4.cu
    │   │   │   ├── marlin_moe_kernel_ku4.h
    │   │   │   ├── marlin_moe_kernel_ku4b8.cu
    │   │   │   ├── marlin_moe_kernel_ku4b8.h
    │   │   │   ├── marlin_moe_kernel_ku8b128.cu
    │   │   │   └── marlin_moe_kernel_ku8b128.h
    │   │   ├── marlin_moe_ops.cu
    │   │   ├── moe_ops.h
    │   │   ├── topk_softmax_kernels.cu
    │   │   └── torch_bindings.cpp
    │   ├── moe_align_block_size_kernels.cu
    │   ├── ops.h
    │   ├── permute_cols.cu
    │   ├── pos_encoding_kernels.cu
    │   ├── prepare_inputs
    │   │   ├── advance_step.cu
    │   │   └── advance_step.cuh
    │   ├── quantization
    │   │   ├── aqlm
    │   │   │   └── gemm_kernels.cu
    │   │   ├── awq
    │   │   │   ├── dequantize.cuh
    │   │   │   └── gemm_kernels.cu
    │   │   ├── compressed_tensors
    │   │   │   └── int8_quant_kernels.cu
    │   │   ├── cutlass_w8a8
    │   │   │   ├── Epilogues.md
    │   │   │   ├── broadcast_load_epilogue_c2x.hpp
    │   │   │   ├── broadcast_load_epilogue_c3x.hpp
    │   │   │   ├── common.hpp
    │   │   │   ├── scaled_mm_c2x.cu
    │   │   │   ├── scaled_mm_c2x.cuh
    │   │   │   ├── scaled_mm_c2x_sm75_dispatch.cuh
    │   │   │   ├── scaled_mm_c2x_sm80_dispatch.cuh
    │   │   │   ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh
    │   │   │   ├── scaled_mm_c2x_sm89_int8_dispatch.cuh
    │   │   │   ├── scaled_mm_c3x.cu
    │   │   │   └── scaled_mm_entry.cu
    │   │   ├── fp8
    │   │   │   ├── amd
    │   │   │   │   ├── hip_float8.h
    │   │   │   │   ├── hip_float8_impl.h
    │   │   │   │   └── quant_utils.cuh
    │   │   │   ├── common.cu
    │   │   │   ├── fp8_marlin.cu
    │   │   │   └── nvidia
    │   │   │   │   └── quant_utils.cuh
    │   │   ├── gguf
    │   │   │   ├── dequantize.cuh
    │   │   │   ├── ggml-common.h
    │   │   │   ├── gguf_kernel.cu
    │   │   │   ├── mmq.cuh
    │   │   │   ├── mmvq.cuh
    │   │   │   └── vecdotq.cuh
    │   │   ├── gptq
    │   │   │   ├── compat.cuh
    │   │   │   ├── matrix_view.cuh
    │   │   │   ├── q_gemm.cu
    │   │   │   ├── qdq_2.cuh
    │   │   │   ├── qdq_3.cuh
    │   │   │   ├── qdq_4.cuh
    │   │   │   ├── qdq_8.cuh
    │   │   │   └── qdq_util.cuh
    │   │   ├── gptq_marlin
    │   │   │   ├── awq_marlin_repack.cu
    │   │   │   ├── gptq_marlin.cu
    │   │   │   ├── gptq_marlin_repack.cu
    │   │   │   ├── marlin.cuh
    │   │   │   └── marlin_dtypes.cuh
    │   │   ├── machete
    │   │   │   ├── Readme.md
    │   │   │   ├── generate.py
    │   │   │   ├── machete_collective_builder.cuh
    │   │   │   ├── machete_interleaving_utils.cuh
    │   │   │   ├── machete_mainloop.cuh
    │   │   │   ├── machete_mm_kernel.cuh
    │   │   │   ├── machete_mm_launcher.cuh
    │   │   │   ├── machete_prepack_kernel.cuh
    │   │   │   ├── machete_prepack_launcher.cuh
    │   │   │   ├── machete_prepacked_layout.cuh
    │   │   │   └── machete_pytorch.cu
    │   │   └── marlin
    │   │   │   ├── dense
    │   │   │       ├── LICENSE
    │   │   │       ├── common
    │   │   │       │   ├── base.h
    │   │   │       │   └── mem.h
    │   │   │       └── marlin_cuda_kernel.cu
    │   │   │   ├── qqq
    │   │   │       └── marlin_qqq_gemm_kernel.cu
    │   │   │   └── sparse
    │   │   │       ├── LICENSE
    │   │   │       ├── common
    │   │   │           ├── base.h
    │   │   │           ├── mem.h
    │   │   │           └── mma.h
    │   │   │       └── marlin_24_cuda_kernel.cu
    │   ├── rocm
    │   │   ├── attention.cu
    │   │   ├── ops.h
    │   │   └── torch_bindings.cpp
    │   └── torch_bindings.cpp
    ├── requirements-vllm-build.txt
    ├── requirements-vllm.txt
    ├── setup.py
    ├── slurm_configs
    │   ├── cgroup.conf
    │   ├── gres.conf
    │   ├── munge.key
    │   ├── slurm.conf
    │   └── slurm.key
    ├── slurm_src
    │   ├── cgroup_docker.c
    │   └── compile.sh
    └── util
    │   └── plot_training.py
├── ml
    ├── cray_megatron
    │   ├── collectives
    │   │   ├── data_parallelism.py
    │   │   └── main_rank_only.py
    │   ├── huggingface
    │   │   └── download_model.py
    │   ├── main.py
    │   ├── megatron
    │   │   ├── dataset
    │   │   │   ├── data_loader.py
    │   │   │   └── load_dataset.py
    │   │   ├── distribution
    │   │   │   └── apply_distribution_strategy.py
    │   │   ├── megatron_trainer.py
    │   │   └── training_loop.py
    │   └── models
    │   │   ├── does_any_checkpoint_exist.py
    │   │   ├── get_latest_checkpoint_path.py
    │   │   ├── get_model_manager.py
    │   │   ├── model_manager_base.py
    │   │   └── tokenformer
    │   │       ├── load_tokenformer_model.py
    │   │       └── tokenformer_model_manager.py
    └── tokenformer
    │   ├── llama_tokenformer_layers.py
    │   ├── llama_tokenformer_model.py
    │   ├── tokenformer_surgeon.py
    │   └── transformers_tokenformer.py
├── requirements.txt
├── scalarlm
├── scripts
    ├── cray
    ├── start_one_server.sh
    ├── start_slurm.sh
    └── train_job_entrypoint.sh
├── sdk
    ├── masint
    │   ├── __init__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── async_supermassive_intelligence.py
    │   │   └── supermassive_intelligence.py
    │   ├── cli
    │   │   ├── __init__.py
    │   │   ├── ls.py
    │   │   ├── main.py
    │   │   ├── plot.py
    │   │   ├── squeue.py
    │   │   └── view_logs.py
    │   ├── engines
    │   │   ├── __init__.py
    │   │   ├── async_cray.py
    │   │   └── cray
    │   │   │   ├── __init__.py
    │   │   │   └── submit_training_job.py
    │   └── util
    │   │   ├── __init__.py
    │   │   ├── get_api_base.py
    │   │   └── make_api_url.py
    ├── pyproject.toml
    └── scalarlm
    │   └── __init__.py
└── test
    ├── benchmark
        ├── main.py
        ├── pytorch
        │   ├── backward.py
        │   ├── forward.py
        │   ├── gemm.py
        │   ├── memcpy.py
        │   ├── memcpy_peer.py
        │   └── mpi_p2p.py
        └── roofline
        │   ├── plot_bandwidth_sweep.py
        │   └── plot_roofline.py
    ├── deployment
        ├── embed.py
        ├── generate.py
        ├── health.py
        ├── train.py
        └── train_generate.py
    ├── infra
        ├── distribution_strategy
        │   ├── benchmark_mpi_collectives.py
        │   ├── benchmark_mpi_sendrecv.py
        │   └── test_fsdp.py
        ├── generate.py
        ├── get_results.py
        ├── health.py
        ├── openai_client.py
        ├── sanity.py
        ├── slurm.py
        ├── upload_dataset.py
        ├── vllm
        │   └── tokenformer
        │   │   └── test_tokenformer.py
        └── vllm_health.py
    ├── ml
        ├── rl
        │   ├── cs_semester.sqlite
        │   ├── mini-bird.json
        │   └── sql-reasoning.py
        ├── sql
        │   ├── data.json
        │   ├── train.py
        │   └── train_generate.py
        └── tokenformer
        │   ├── test_llama_tokenformer_model.py
        │   ├── test_tokenformer.py
        │   └── test_tokenformer_surgeon.py
    └── requirements-pytest.txt


/.github/workflows/depot-amd.yml:
--------------------------------------------------------------------------------
 1 | name: Build AMD image using depot
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: depot-ubuntu-24.04-8
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Set up Depot CLI
15 |         uses: depot/setup-action@v1
16 | 
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
22 | 
23 |       - name: Build and push
24 |         uses: depot/build-push-action@v1
25 |         env:
26 |           DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 |         with:
28 |           # if no depot.json file is at the root of your repo, you must specify the project id
29 |           project: 39xfdrxfqt
30 |           push: true
31 |           tags: tensorwave/scalarlm-amd:latest
32 |           build-args: |
33 |             BASE_NAME=amd
34 |             VLLM_TARGET_DEVICE=rocm
35 |             PYTORCH_ROCM_ARCH=gfx90a;gfx942
36 |             MAX_JOBS=8
37 | 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/.github/workflows/depot-cpu.yml:
--------------------------------------------------------------------------------
 1 | name: Build CPU image using depot
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: depot-ubuntu-22.04-8
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Set up Depot CLI
15 |         uses: depot/setup-action@v1
16 | 
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
22 | 
23 |       - name: Build and push
24 |         uses: depot/build-push-action@v1
25 |         env:
26 |           DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 |         with:
28 |           # if no depot.json file is at the root of your repo, you must specify the project id
29 |           project: 39xfdrxfqt
30 |           push: true
31 |           tags: tensorwave/scalarlm-cpu:latest
32 |           build-args: |
33 |             BASE_NAME=cpu
34 |             VLLM_TARGET_DEVICE=cpu
35 |             TORCH_CUDA_ARCH_LIST=""
36 |             MAX_JOBS=8
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia-8.0.yml:
--------------------------------------------------------------------------------
 1 | name: Build NVIDIA CUDA 8.0 image using depot
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: depot-ubuntu-22.04
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Set up Depot CLI
15 |         uses: depot/setup-action@v1
16 | 
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
22 | 
23 |       - name: Build and push
24 |         uses: depot/build-push-action@v1
25 |         env:
26 |           DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 |         with:
28 |           # if no depot.json file is at the root of your repo, you must specify the project id
29 |           project: 39xfdrxfqt
30 |           push: true
31 |           tags: gdiamos/scalarlm-nvidia-8.0:latest
32 |           build-args: |
33 |             BASE_NAME=nvidia
34 |             VLLM_TARGET_DEVICE=cuda
35 |             TORCH_CUDA_ARCH_LIST=8.0
36 |             MAX_JOBS=2
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia-8.6.yml:
--------------------------------------------------------------------------------
 1 | name: Build NVIDIA CUDA 8.6 image using depot
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: depot-ubuntu-22.04
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Set up Depot CLI
15 |         uses: depot/setup-action@v1
16 | 
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
22 | 
23 |       - name: Build and push
24 |         uses: depot/build-push-action@v1
25 |         env:
26 |           DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 |         with:
28 |           # if no depot.json file is at the root of your repo, you must specify the project id
29 |           project: 39xfdrxfqt
30 |           push: true
31 |           tags: tensorwave/scalarlm-nvidia-8.6:latest
32 |           build-args: |
33 |             BASE_NAME=nvidia
34 |             VLLM_TARGET_DEVICE=cuda
35 |             TORCH_CUDA_ARCH_LIST=8.6
36 |             MAX_JOBS=2
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/.github/workflows/depot-nvidia.yml:
--------------------------------------------------------------------------------
 1 | name: Build NVIDIA image using depot
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: depot-ubuntu-22.04-8
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Set up Depot CLI
15 |         uses: depot/setup-action@v1
16 | 
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v2
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
22 | 
23 |       - name: Build and push
24 |         uses: depot/build-push-action@v1
25 |         env:
26 |           DEPOT_TOKEN: ${{ secrets.DEPOT_TOKEN }}
27 |         with:
28 |           # if no depot.json file is at the root of your repo, you must specify the project id
29 |           project: 39xfdrxfqt
30 |           push: true
31 |           tags: tensorwave/scalarlm-nvidia:latest
32 |           build-args: |
33 |             BASE_NAME=nvidia
34 |             VLLM_TARGET_DEVICE=cuda
35 |             TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0
36 |             MAX_JOBS=8
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
 1 | name: Build and run unit tests
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | 
 7 | jobs:
 8 |   docker-image:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - name: Checkout repo
12 |         uses: actions/checkout@v3
13 | 
14 |       - name: Run tests
15 |         run: >
16 |             ./cray test
17 | 
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | **/*.swp
 2 | **/*.so
 3 | models/*
 4 | infra/slurm_configs/slurm.conf
 5 | scripts/cray
 6 | 
 7 | *.DS_Store
 8 | **/__pycache__/
 9 | .env
10 | .idea
11 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/psf/black
3 |     rev: 24.10.0
4 |     hooks:
5 |       - id: black
6 | 
7 | 


--------------------------------------------------------------------------------
/cmd/bashly.sh:
--------------------------------------------------------------------------------
 1 | # e exit on first failure
 2 | # x all executed commands are printed to the terminal
 3 | # u unset variables are errors
 4 | # a export all variables to the environment
 5 | # E any trap on ERR is inherited by shell functions
 6 | # -o pipefail | produces a failure code if any stage fails
 7 | set -Eeuoxa pipefail
 8 | 
 9 | # Get the directory of this script
10 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
11 | 
12 | TTY=-t
13 | if test -t 0; then
14 |   TTY=-it
15 | fi
16 | 
17 | # Run the docker container
18 | docker run --rm $TTY --user $(id -u):$(id -g) \
19 |     --volume "$LOCAL_DIRECTORY:/app/cmd" \
20 |     --volume "$LOCAL_DIRECTORY/../scripts:/app/scripts" \
21 |     --volume "$LOCAL_DIRECTORY/bashly-settings.yml:/app/bashly-settings.yml" \
22 |     dannyben/bashly "$@"
23 | 


--------------------------------------------------------------------------------
/cmd/benchmark_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | target=${args[target]}
 4 | visible_gpus=${args[visible-gpus]}
 5 | 
 6 | ./cray build-image $target
 7 | 
 8 | declare -a benchmark_command_parts
 9 | benchmark_command_parts=(
10 |       "CUDA_VISIBLE_DEVICES=${visible_gpus}" "python" "/app/cray/test/benchmark/main.py"
11 | )
12 | 
13 | benchmark_command="${benchmark_command_parts[*]}"
14 | 
15 | echo $command
16 | 
17 | # Get the directory of this script
18 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
19 | 
20 | # Set cwd to the project root directory
21 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
22 | 
23 | declare -a docker_command_parts
24 | 
25 | # Make sure the data directory exists
26 | mkdir -p $ROOT_DIRECTORY/data
27 | 
28 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host" "-v" "$ROOT_DIRECTORY/data:/app/cray/data")
29 | 
30 | declare -a gpu_options
31 | 
32 | # Set the GPU options depending on the target
33 | if [ "$target" == "cpu" ]; then
34 |     gpu_options+=()
35 | elif [ "$target" == "amd" ]; then
36 |     gpu_options+=("--device" "/dev/kfd" "--device" "/dev/dri")
37 | else
38 |     gpu_options+=("--gpus" "all")
39 | fi
40 | 
41 | docker_command_parts+=("${gpu_options[@]}")
42 | docker_command_parts+=("cray:latest" "sh" "-c" "'$benchmark_command'")
43 | 
44 | docker_command="${docker_command_parts[*]}"
45 | echo $docker_command
46 | eval $docker_command
47 | 
48 | 


--------------------------------------------------------------------------------
/cmd/build_image_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | target=${args[target]}
 4 | 
 5 | declare -a vllm_target_device
 6 | declare -a docker_platform
 7 | 
 8 | # If target is cpu, build the image with the cpu base image
 9 | if [ "$target" == "cpu" ]; then
10 |     vllm_target_device=("cpu")
11 |     if [ "$(uname -m)" == "x86_64" ]; then
12 |         docker_platform=("linux/amd64")
13 |     else
14 |         docker_platform=("linux/arm64/v8")
15 |     fi
16 | elif [ "$target" == "amd" ]; then
17 |     vllm_target_device=("rocm")
18 |     docker_platform=("linux/amd64")
19 | else
20 |     vllm_target_device=("cuda")
21 |     docker_platform=("linux/amd64")
22 | fi
23 | 
24 | docker_build_command="docker build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t cray:latest --shm-size=8g ."
25 | 
26 | # Run docker build command
27 | echo $(green_bold Building image with command: ${docker_build_command})
28 | eval $docker_build_command
29 | 
30 | echo $(green_bold Successfully built image)
31 | 


--------------------------------------------------------------------------------
/cmd/depot_build_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | target=${args[target]}
 4 | 
 5 | declare -a vllm_target_device
 6 | declare -a docker_platform
 7 | 
 8 | # If target is cpu, build the image with the cpu base image
 9 | if [ "$target" == "cpu" ]; then
10 |     vllm_target_device=("cpu")
11 |     docker_platform=("linux/amd64")
12 | elif [ "$target" == "arm" ]; then
13 |     vllm_target_device=("cpu")
14 |     docker_platform=("linux/arm64/v8")
15 | elif [ "$target" == "amd" ]; then
16 |     vllm_target_device=("rocm")
17 |     docker_platform=("linux/amd64")
18 | else
19 |     vllm_target_device=("cuda")
20 |     docker_platform=("linux/amd64")
21 | fi
22 | 
23 | docker_build_command="depot build --platform ${docker_platform} --build-arg BASE_NAME=${target} --build-arg VLLM_TARGET_DEVICE=${vllm_target_device} -t gdiamos/cray-${target}:latest --push ."
24 | 
25 | # Run docker build command
26 | echo $(green_bold Building image with command: ${docker_build_command})
27 | eval $docker_build_command
28 | 
29 | echo $(green_bold Successfully built image)
30 | 
31 | 


--------------------------------------------------------------------------------
/cmd/llm_logs_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | model=${args[model]}
 4 | tail=${args[--tail]}
 5 | lines=${args[--lines]}
 6 | follow=${args[--follow]}
 7 | 
 8 | if [ -z "$model" ]; then
 9 |     model="latest"
10 | fi
11 | 
12 | ./cray build-image
13 | 
14 | declare -a log_command_parts
15 | log_command_parts=(
16 |       "python" "/app/cray/sdk/masint/cli/main.py" "logs" "--model" "$model" "--lines" "$lines"
17 | )
18 | 
19 | echo $tail
20 | 
21 | # If tail exists, add it to the command
22 | if [ -n "$tail" ]; then
23 |     log_command_parts+=("--tail")
24 | fi
25 | 
26 | # If follow exists, add it to the command
27 | if [ -n "$follow" ]; then
28 |     log_command_parts+=("--follow")
29 | fi
30 | 
31 | log_command="${log_command_parts[*]}"
32 | 
33 | echo $command
34 | 
35 | declare -a docker_command_parts
36 | 
37 | docker_command_parts=("docker" "run" "-it" "--rm" "--network" "host")
38 | 
39 | docker_command_parts+=("cray:latest" "sh" "-c" "'$log_command'")
40 | 
41 | docker_command="${docker_command_parts[*]}"
42 | echo $docker_command
43 | eval $docker_command
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/cmd/llm_ls_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | ./cray build-image
 4 | 
 5 | declare -a ls_command_parts
 6 | ls_command_parts=(
 7 |       "python" "/app/cray/sdk/masint/cli/main.py" "ls"
 8 | )
 9 | 
10 | ls_command="${ls_command_parts[*]}"
11 | 
12 | echo $command
13 | 
14 | # Get the directory of this script
15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
16 | 
17 | # Set cwd to the project root directory
18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
19 | 
20 | declare -a docker_command_parts
21 | 
22 | # Make sure the data directory exists
23 | mkdir -p $ROOT_DIRECTORY/data
24 | 
25 | docker_command_parts=("docker" "run" "--rm" "--network" "host")
26 | 
27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$ls_command'")
28 | 
29 | docker_command="${docker_command_parts[*]}"
30 | echo $docker_command
31 | eval $docker_command
32 | 
33 | 


--------------------------------------------------------------------------------
/cmd/llm_plot_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | model=${args[model]}
 4 | 
 5 | if [ -z "$model" ]; then
 6 |     model="latest"
 7 | fi
 8 | 
 9 | ./cray build-image
10 | 
11 | declare -a plot_command_parts
12 | plot_command_parts=(
13 |       "python" "/app/cray/sdk/masint/cli/main.py" "plot" "--model" "$model"
14 | )
15 | 
16 | plot_command="${plot_command_parts[*]}"
17 | 
18 | echo $command
19 | 
20 | # Get the directory of this script
21 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
22 | 
23 | # Set cwd to the project root directory
24 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
25 | 
26 | declare -a docker_command_parts
27 | 
28 | # Make sure the data directory exists
29 | mkdir -p $ROOT_DIRECTORY/data
30 | 
31 | docker_command_parts=("docker" "run" "--rm" "-v" "$ROOT_DIRECTORY/data:/app/cray/data" "--network" "host")
32 | 
33 | docker_command_parts+=("cray:latest" "sh" "-c" "'$plot_command'")
34 | 
35 | docker_command="${docker_command_parts[*]}"
36 | echo $docker_command
37 | eval $docker_command
38 | 
39 | 


--------------------------------------------------------------------------------
/cmd/llm_squeue_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | ./cray build-image
 4 | 
 5 | declare -a squeue_command_parts
 6 | squeue_command_parts=(
 7 |       "python" "/app/cray/sdk/masint/cli/main.py" "squeue"
 8 | )
 9 | 
10 | squeue_command="${squeue_command_parts[*]}"
11 | 
12 | echo $command
13 | 
14 | # Get the directory of this script
15 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
16 | 
17 | # Set cwd to the project root directory
18 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/..
19 | 
20 | declare -a docker_command_parts
21 | 
22 | # Make sure the data directory exists
23 | mkdir -p $ROOT_DIRECTORY/data
24 | 
25 | docker_command_parts=("docker" "run" "--rm" "--network" "host")
26 | 
27 | docker_command_parts+=("cray:latest" "sh" "-c" "'$squeue_command'")
28 | 
29 | docker_command="${docker_command_parts[*]}"
30 | echo $docker_command
31 | eval $docker_command
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/cmd/pypi_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | # Get the directory of this script
 4 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 5 | 
 6 | # Set cwd to the project sdk directory
 7 | ROOT_DIRECTORY=$LOCAL_DIRECTORY/../sdk
 8 | 
 9 | cd $ROOT_DIRECTORY
10 | 
11 | # Build sdk wheel from sdk/pyproject.toml
12 | wheel_build_command="python -m build --sdist --wheel --outdir dist/ ."
13 | 
14 | # Run sdk wheel build
15 | echo $(green_bold Building wheel with command: ${wheel_build_command})
16 | eval $wheel_build_command
17 | 
18 | echo $(green_bold Successfully built wheel)
19 | 
20 | # Upload wheel to pypi
21 | pypi_upload_command="twine upload dist/*"
22 | 
23 | # Run pypi upload command
24 | echo $(green_bold Uploading wheel to pypi with command: ${pypi_upload_command})
25 | eval $pypi_upload_command
26 | 
27 | echo $(green_bold Successfully uploaded wheel to pypi)
28 | 
29 | 


--------------------------------------------------------------------------------
/cmd/up_command.sh:
--------------------------------------------------------------------------------
 1 | inspect_args
 2 | 
 3 | target=${args[target]}
 4 | 
 5 | declare -a vllm_target_device
 6 | declare -a docker_compose_service
 7 | 
 8 | if [ "$target" == "cpu" ]; then
 9 |     vllm_target_device=("cpu")
10 |     docker_compose_service="cray"
11 | elif [ "$target" == "amd" ]; then
12 |     vllm_target_device=("rocm")
13 |     docker_compose_service="cray-amd"
14 | else
15 |     vllm_target_device=("cuda")
16 |     docker_compose_service="cray-nvidia"
17 | fi
18 | 
19 | BASE_NAME=${target} VLLM_TARGET_DEVICE=${vllm_target_device} docker compose -f docker-compose.yaml up ${docker_compose_service} --build --force-recreate
20 | 


--------------------------------------------------------------------------------
/deployment/ansible/hosts:
--------------------------------------------------------------------------------
1 | ini
2 | [localhost]
3 | localhost ansible_connection=local
4 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "scalarlm.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "scalarlm.vllmname" -}}
 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 8 | {{- end -}}
 9 | 
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 | 
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/api_configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-api-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 |     server_list: api
13 |     max_train_time: {{ .Values.max_train_time }}
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.api_port }}
 9 |       targetPort: 8000
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.labels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-vllm-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 |     server_list: vllm
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_multi_node/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.vllmname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.vllm_port }}
 9 |       targetPort: 8001
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/local-hostpath-sc.yaml:
--------------------------------------------------------------------------------
1 | # local-hostpath-sc.yaml
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 |   name: local-hostpath
6 | provisioner: kubernetes.io/no-provisioner
7 | volumeBindingMode: Immediate
8 | 
9 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/storageclass-clusterrole.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   name: storageclass-manager
 5 | rules:
 6 |   - apiGroups: ["storage.k8s.io"]
 7 |     resources: ["storageclasses"]
 8 |     verbs: ["get", "list", "create", "delete", "patch", "update"]
 9 | 
10 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "scalarlm.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "scalarlm.vllmname" -}}
 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 8 | {{- end -}}
 9 | 
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 | 
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/api_configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-api-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 |     server_list: api
13 |     max_train_time: {{ .Values.max_train_time }}
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.api_port }}
 9 |       targetPort: 8000
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.labels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-cache
 5 |   annotations:
 6 |     helm.sh/resource-policy: keep
 7 | spec:
 8 |   accessModes:
 9 |     - ReadWriteOnce
10 |   resources:
11 |     requests:
12 |       storage: {{ .Values.cache_pvc.size }}
13 |   storageClassName: {{ .Values.cache_pvc.storageClass }}
14 |   wait_until_bound: false
15 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-jobs
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: {{ .Values.jobs_pvc.size }}
11 |   storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 |   wait_until_bound: false
13 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-vllm-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 |     server_list: vllm
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.vllmname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.vllm_port }}
 9 |       targetPort: 8001
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_node/scalarlm/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   repository: sudnya/scalarlm-rocm
 3 |   tag: v0.7
 4 |   pullPolicy: Always
 5 | 
 6 | env:
 7 |   - name: HIP_VISIBLE_DEVICES
 8 |     value: "0"
 9 |   - name: ROCR_VISIBLE_DEVICES
10 |     value: "0"
11 | 
12 | service:
13 |   type: ClusterIP
14 |   api_port: 8000
15 |   vllm_port: 8001
16 |   externalIP: 10.1.81.248
17 | 
18 | jobs_pvc:
19 |   storageClass: openebs-hostpath
20 |   size: 100Gi
21 | 
22 | cache_pvc:
23 |   storageClass: openebs-hostpath
24 |   size: 32Gi
25 | 
26 | model: meta-llama/Llama-3.1-8B-Instruct
27 | max_model_length: 4096
28 | gpu_memory_utilization: 0.95
29 | 
30 | training_gpus: 2
31 | inference_gpus: 1
32 | 
33 | max_train_time: 86400
34 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "cray.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "cray.labels" -}}
 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
 8 | app.kubernetes.io/instance: {{ .Release.Name }}
 9 | {{- end -}}
10 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "cray.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.port }}
 9 |       targetPort: {{ .Values.service.targetPort }}
10 |       protocol: TCP
11 |       name: http
12 |     - port: 8001
13 |       targetPort: 8001
14 |       protocol: TCP
15 |       name: http2
16 |   externalIPs:
17 |     - {{ .Values.service.externalIP }}
18 |   selector:
19 |     {{- include "cray.labels" . | nindent 4 }}
20 | 


--------------------------------------------------------------------------------
/deployment/helm/amd_single_pod/scalarlm/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for cray-chart.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | replicaCount: 1
 6 | 
 7 | image:
 8 |   repository: sudnya/scalarlm-rocm
 9 |   tag: latest
10 |   pullPolicy: Always
11 | 
12 | env:
13 |   - name: HIP_VISIBLE_DEVICES
14 |     value: "0"
15 |   - name: ROCR_VISIBLE_DEVICES
16 |     value: "0"
17 | service:
18 |   type: ClusterIP
19 |   port: 8000
20 |   targetPort: 8000
21 |   externalIP: 10.1.81.248
22 | 
23 | model: meta-llama/Llama-3.1-8B-Instruct
24 | max_model_length: 4096
25 | gpu_memory_utilization: 0.33
26 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: cray
 3 | description: A Helm chart for Kubernetes
 4 | 
 5 | # A chart can be either an 'application' or a 'library' chart.
 6 | #
 7 | # Application charts are a collection of templates that can be packaged into versioned archives
 8 | # to be deployed.
 9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 | 
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.1.0
19 | 
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | #appVersion: "1.16.0"
25 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/templates/hpa.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.autoscaling.enabled }}
 2 | apiVersion: autoscaling/v2
 3 | kind: HorizontalPodAutoscaler
 4 | metadata:
 5 |   name: {{ include "cray.fullname" . }}
 6 |   labels:
 7 |     {{- include "cray.labels" . | nindent 4 }}
 8 | spec:
 9 |   scaleTargetRef:
10 |     apiVersion: apps/v1
11 |     kind: Deployment
12 |     name: {{ include "cray.fullname" . }}
13 |   minReplicas: {{ .Values.autoscaling.minReplicas }}
14 |   maxReplicas: {{ .Values.autoscaling.maxReplicas }}
15 |   metrics:
16 |     {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
17 |     - type: Resource
18 |       resource:
19 |         name: cpu
20 |         target:
21 |           type: Utilization
22 |           averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
23 |     {{- end }}
24 |     {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
25 |     - type: Resource
26 |       resource:
27 |         name: memory
28 |         target:
29 |           type: Utilization
30 |           averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
31 |     {{- end }}
32 | {{- end }}
33 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/templates/ingress.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.ingress.enabled -}}
 2 | apiVersion: networking.k8s.io/v1
 3 | kind: Ingress
 4 | metadata:
 5 |   name: {{ include "cray.fullname" . }}
 6 |   labels:
 7 |     {{- include "cray.labels" . | nindent 4 }}
 8 |   {{- with .Values.ingress.annotations }}
 9 |   annotations:
10 |     {{- toYaml . | nindent 4 }}
11 |   {{- end }}
12 | spec:
13 |   {{- with .Values.ingress.className }}
14 |   ingressClassName: {{ . }}
15 |   {{- end }}
16 |   {{- if .Values.ingress.tls }}
17 |   tls:
18 |     {{- range .Values.ingress.tls }}
19 |     - hosts:
20 |         {{- range .hosts }}
21 |         - {{ . | quote }}
22 |         {{- end }}
23 |       secretName: {{ .secretName }}
24 |     {{- end }}
25 |   {{- end }}
26 |   rules:
27 |     {{- range .Values.ingress.hosts }}
28 |     - host: {{ .host | quote }}
29 |       http:
30 |         paths:
31 |           {{- range .paths }}
32 |           - path: {{ .path }}
33 |             {{- with .pathType }}
34 |             pathType: {{ . }}
35 |             {{- end }}
36 |             backend:
37 |               service:
38 |                 name: {{ include "cray.fullname" $ }}
39 |                 port:
40 |                   number: {{ $.Values.service.port }}
41 |           {{- end }}
42 |     {{- end }}
43 | {{- end }}
44 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "cray.fullname" . }}
 5 |   labels:
 6 |     {{- include "cray.labels" . | nindent 4 }}
 7 | spec:
 8 |   type: {{ .Values.service.type }}
 9 |   ports:
10 |     - port: {{ .Values.service.port }}
11 |       targetPort: http
12 |       protocol: TCP
13 |       name: http
14 |   selector:
15 |     {{- include "cray.selectorLabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: {{ include "cray.serviceAccountName" . }}
 6 |   labels:
 7 |     {{- include "cray.labels" . | nindent 4 }}
 8 |   {{- with .Values.serviceAccount.annotations }}
 9 |   annotations:
10 |     {{- toYaml . | nindent 4 }}
11 |   {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 | {{- end }}
14 | 


--------------------------------------------------------------------------------
/deployment/helm/cray/templates/tests/test-connection.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: "{{ include "cray.fullname" . }}-test-connection"
 5 |   labels:
 6 |     {{- include "cray.labels" . | nindent 4 }}
 7 |   annotations:
 8 |     "helm.sh/hook": test
 9 | spec:
10 |   containers:
11 |     - name: wget
12 |       image: busybox
13 |       command: ['wget']
14 |       args: ['{{ include "cray.fullname" . }}:{{ .Values.service.port }}']
15 |   restartPolicy: Never
16 | 


--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "cray.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "cray.labels" -}}
 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
 8 | app.kubernetes.io/instance: {{ .Release.Name }}
 9 | {{- end -}}
10 | 


--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "cray.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.port }}
 9 |       targetPort: {{ .Values.service.targetPort }}
10 |       protocol: TCP
11 |       name: http
12 |     - port: 8001
13 |       targetPort: 8001
14 |       protocol: TCP
15 |       name: http2
16 |   externalIPs:
17 |     - {{ .Values.service.externalIP }}
18 |   selector:
19 |     {{- include "cray.labels" . | nindent 4 }}
20 | 


--------------------------------------------------------------------------------
/deployment/helm/lambda/cray/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for cray-chart.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | replicaCount: 1
 6 | 
 7 | image:
 8 |   repository: gdiamos/cray-nvidia
 9 |   tag: latest
10 |   pullPolicy: IfNotPresent
11 | 
12 | service:
13 |   type: ClusterIP
14 |   port: 8000
15 |   targetPort: 8000
16 |   externalIP: 104.171.203.79
17 | 
18 | model: meta-llama/Llama-3.2-3B-Instruct
19 | max_model_length: 4096
20 | gpu_memory_utilization: 0.33
21 | 
22 | 


--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the Cray service
4 | name: cray
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "cray.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "cray.labels" -}}
 7 | app.kubernetes.io/name: {{ include "cray.fullname" . }}
 8 | app.kubernetes.io/instance: {{ .Release.Name }}
 9 | {{- end -}}
10 | 


--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: {{ include "cray.fullname" . }}
 5 | spec:
 6 |   replicas: {{ .Values.replicaCount }}
 7 |   selector:
 8 |     matchLabels:
 9 |       {{- include "cray.labels" . | nindent 6 }}
10 |   template:
11 |     metadata:
12 |       labels:
13 |         {{- include "cray.labels" . | nindent 8 }}
14 |     spec:
15 |     {{- with .Values.imagePullSecrets }}
16 |       imagePullSecrets:
17 |         {{- toYaml . | nindent 8 }}
18 |     {{- end }}
19 |       containers:
20 |         - name: {{ .Chart.Name }}
21 |           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
22 |           imagePullPolicy: {{ .Values.image.pullPolicy }}
23 |           command: ["/app/cray/scripts/start_one_server.sh"]
24 |           ports:
25 |             - name: http
26 |               containerPort: 8000
27 |               hostPort: 8000
28 |               protocol: TCP
29 |             - name: http2
30 |               containerPort: 8001
31 |               hostPort: 8001
32 |               protocol: TCP
33 |           volumeMounts:
34 |           {{- range .Values.volumes }}
35 |             - name: {{ .name }}
36 |               mountPath: {{ .path }}
37 |           {{- end }}
38 |       volumes:
39 |       {{- range .Values.volumes }}
40 |         - name: {{ .name }}
41 |           hostPath:
42 |             path: {{ .hostPath }}
43 |       {{- end }}
44 | 


--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "cray.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.port }}
 9 |       targetPort: {{ .Values.service.targetPort }}
10 |       protocol: TCP
11 |       name: http
12 |     - port: 8001
13 |       targetPort: 8001
14 |       protocol: TCP
15 |       name: http2
16 |   selector:
17 |     {{- include "cray.labels" . | nindent 4 }}
18 | 


--------------------------------------------------------------------------------
/deployment/helm/minikube/cray/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for cray-chart.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | replicaCount: 1
 6 | 
 7 | image:
 8 |   repository: gdiamos/masint-arm
 9 |   tag: latest
10 |   pullPolicy: IfNotPresent
11 | 
12 | service:
13 |   type: ClusterIP
14 |   port: 8000
15 |   targetPort: 8000
16 | 
17 | volumes:
18 |   - name: ml
19 |     path: /app/cray/ml
20 |     hostPath: /Users/gregorydiamos/checkout/cray/ml
21 | 
22 | network:
23 |   name: cray-network
24 | 
25 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "scalarlm.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "scalarlm.vllmname" -}}
 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 8 | {{- end -}}
 9 | 
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 | 
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/api_configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-api-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 |     server_list: api
13 |     max_train_time: {{ .Values.max_train_time }}
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.api_port }}
 9 |       targetPort: 8000
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.labels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-3b-cache
 5 |   annotations:
 6 |     helm.sh/resource-policy: keep
 7 | spec:
 8 |   accessModes:
 9 |     - ReadWriteOnce
10 |   resources:
11 |     requests:
12 |       storage: {{ .Values.cache_pvc.size }}
13 |   storageClassName: {{ .Values.cache_pvc.storageClass }}
14 |   wait_until_bound: false
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-3b-jobs
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: {{ .Values.jobs_pvc.size }}
11 |   storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 |   wait_until_bound: false
13 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-vllm-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 |     server_list: vllm
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.vllmname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.vllm_port }}
 9 |       targetPort: 8001
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave3b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   repository: tensorwave/scalarlm-amd
 3 |   tag: latest
 4 |   pullPolicy: Always
 5 | 
 6 | service:
 7 |   type: ClusterIP
 8 |   api_port: 8100
 9 |   vllm_port: 8101
10 |   externalIP: 64.139.222.101
11 | 
12 | jobs_pvc:
13 |   storageClass: local-path
14 |   size: 100Gi
15 | 
16 | cache_pvc:
17 |   storageClass: local-path
18 |   size: 16Gi
19 | 
20 | model: meta-llama/Llama-3.2-3B-Instruct
21 | max_model_length: 32768
22 | gpu_memory_utilization: 0.95
23 | 
24 | training_gpus: 1
25 | inference_gpus: 1
26 | 
27 | max_train_time: 14400
28 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "scalarlm.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "scalarlm.vllmname" -}}
 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 8 | {{- end -}}
 9 | 
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 | 
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/api_configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-api-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 |     server_list: api
13 |     max_train_time: {{ .Values.max_train_time }}
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.api_port }}
 9 |       targetPort: 8000
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.labels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-70b-cache
 5 |   annotations:
 6 |     helm.sh/resource-policy: keep
 7 | spec:
 8 |   accessModes:
 9 |     - ReadWriteOnce
10 |   resources:
11 |     requests:
12 |       storage: {{ .Values.cache_pvc.size }}
13 |   storageClassName: {{ .Values.cache_pvc.storageClass }}
14 |   wait_until_bound: false
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-70b-jobs
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: {{ .Values.jobs_pvc.size }}
11 |   storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 |   wait_until_bound: false
13 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-vllm-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 |     server_list: vllm
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.vllmname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.vllm_port }}
 9 |       targetPort: 8001
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave70b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   repository: sudnya/scalarlm-rocm
 3 |   tag: v0.8
 4 |   pullPolicy: Always
 5 | 
 6 | service:
 7 |   type: ClusterIP
 8 |   api_port: 8200
 9 |   vllm_port: 8201
10 |   externalIP: 64.139.222.101
11 | 
12 | jobs_pvc:
13 |   storageClass: local-path
14 |   size: 100Gi
15 | 
16 | cache_pvc:
17 |   storageClass: local-path
18 |   size: 200Gi
19 | 
20 | model: meta-llama/Llama-3.3-70B-Instruct
21 | max_model_length: 4096
22 | gpu_memory_utilization: 0.95
23 | 
24 | training_gpus: 2
25 | inference_gpus: 1
26 | 
27 | max_train_time: 86400
28 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | appVersion: 1.0.0
3 | description: A Helm chart for the ScalarLM service
4 | name: scalarlm
5 | version: 1.0.0
6 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | gotemplate
 2 | {{- define "scalarlm.fullname" -}}
 3 | {{- printf "%s" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 4 | {{- end -}}
 5 | 
 6 | {{- define "scalarlm.vllmname" -}}
 7 | {{- printf "%s-vllm" .Chart.Name | trunc 63 | trimSuffix "-" -}}
 8 | {{- end -}}
 9 | 
10 | {{- define "scalarlm.labels" -}}
11 | app.kubernetes.io/name: {{ include "scalarlm.fullname" . }}
12 | app.kubernetes.io/instance: {{ .Release.Name }}
13 | {{- end -}}
14 | 
15 | {{- define "scalarlm.vllmlabels" -}}
16 | app.kubernetes.io/name: {{ include "scalarlm.vllmname" . }}
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | {{- end -}}
19 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/api_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/api_configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-api-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     vllm_api_url: "http://scalarlm-vllm:{{ .Values.service.vllm_port }}"
12 |     server_list: api
13 |     max_train_time: {{ .Values.max_train_time }}
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/api_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.fullname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.api_port }}
 9 |       targetPort: 8000
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.labels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/cache_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-8b-cache
 5 |   annotations:
 6 |     helm.sh/resource-policy: keep
 7 | spec:
 8 |   accessModes:
 9 |     - ReadWriteOnce
10 |   resources:
11 |     requests:
12 |       storage: {{ .Values.cache_pvc.size }}
13 |   storageClassName: {{ .Values.cache_pvc.storageClass }}
14 |   wait_until_bound: false
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/jobs_pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: scalarlm-8b-jobs
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteOnce
 8 |   resources:
 9 |     requests:
10 |       storage: {{ .Values.jobs_pvc.size }}
11 |   storageClassName: {{ .Values.jobs_pvc.storageClass }}
12 |   wait_until_bound: false
13 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/vllm_configmap.yaml:
--------------------------------------------------------------------------------
 1 | # templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ .Release.Name }}-vllm-configmap
 6 | data:
 7 |   cray-config.yaml: |
 8 |     model: {{ .Values.model }}
 9 |     max_model_length: {{ .Values.max_model_length }}
10 |     gpu_memory_utilization: {{ .Values.gpu_memory_utilization }}
11 |     api_url: "http://scalarlm:{{ .Values.service.api_port }}"
12 |     server_list: vllm
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/templates/vllm_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "scalarlm.vllmname" . }}
 5 | spec:
 6 |   type: {{ .Values.service.type }}
 7 |   ports:
 8 |     - port: {{ .Values.service.vllm_port }}
 9 |       targetPort: 8001
10 |       protocol: TCP
11 |       name: http
12 |   externalIPs:
13 |     - {{ .Values.service.externalIP }}
14 |   selector:
15 |     {{- include "scalarlm.vllmlabels" . | nindent 4 }}
16 | 


--------------------------------------------------------------------------------
/deployment/helm/tensorwave8b/scalarlm/values.yaml:
--------------------------------------------------------------------------------
 1 | image:
 2 |   repository: gdiamos/scalarlm-amd #tensorwave/scalarlm-amd
 3 |   tag: fsdp
 4 |   pullPolicy: Always
 5 | 
 6 | service:
 7 |   type: ClusterIP
 8 |   api_port: 9000
 9 |   vllm_port: 9001
10 |   externalIP: 64.139.222.101
11 | 
12 | jobs_pvc:
13 |   storageClass: local-path
14 |   size: 100Gi
15 | 
16 | cache_pvc:
17 |   storageClass: local-path
18 |   size: 32Gi
19 | 
20 | model: meta-llama/Llama-3.1-8B-Instruct
21 | max_model_length: 4096
22 | gpu_memory_utilization: 0.95
23 | 
24 | training_gpus: 2
25 | inference_gpus: 1
26 | 
27 | max_train_time: 86400
28 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 | 
 3 |     cray: &cray
 4 |         command: /app/cray/scripts/start_one_server.sh
 5 |         build:
 6 |             context: .
 7 |             dockerfile: Dockerfile
 8 |             args:
 9 |                 - BASE_NAME=${BASE_NAME}
10 |                 - VLLM_TARGET_DEVICE=${VLLM_TARGET_DEVICE}
11 |         ports:
12 |             - "8000:8000"
13 |             - "8001:8001"
14 |         volumes:
15 |             - type: bind
16 |               source: ./models
17 |               target: /root/.cache/huggingface
18 |             - type: bind
19 |               source: ./infra/cray_infra
20 |               target: /app/cray/infra/cray_infra
21 |             - type: bind
22 |               source: ./scripts
23 |               target: /app/cray/scripts
24 |             - type: bind
25 |               source: ./ml
26 |               target: /app/cray/ml
27 |             - type: bind
28 |               source: ./test
29 |               target: /app/cray/test
30 |         networks:
31 |             - cray-network
32 | 
33 | 
34 |     cray-nvidia:
35 |         <<: *cray
36 |         deploy:
37 |             resources:
38 |                 reservations:
39 |                     devices:
40 |                         - driver: nvidia
41 |                           capabilities: [gpu]
42 | 
43 |     cray-amd:
44 |         <<: *cray
45 |         devices:
46 |           - /dev/kfd
47 |           - /dev/dri
48 |         security_opt:
49 |           - seccomp:unconfined
50 | 
51 | 
52 | networks:
53 |   cray-network:
54 |     name: cray_network
55 | 
56 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/arch.md:
--------------------------------------------------------------------------------
 1 | # ScalarLM
 2 | 
 3 | ScalarLM has three high level APIs:
 4 | 
 5 | * **completions** provides OpenAI client compatibility
 6 | * **generate** provides a simple interface for generating text
 7 | * **train** provides a simple interface for submitting training jobs
 8 | 
 9 | ![ScalarLM overview](assets/cray-arch.png)
10 | 
11 | 
12 | Inference is performed by vLLM workers that are orchestrated by pulling requests from a queue.
13 | 
14 | Training is performed by Megatron-LM workers that are orchestrated by SLURM.
15 | 
16 | Trained models are automatically registered with the inference workers.
17 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/cray-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray-arch.png


--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/cray.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/cray.jpeg


--------------------------------------------------------------------------------
/docs/cray-docs/docs/assets/loss_plot_044db4ac60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/docs/cray-docs/docs/assets/loss_plot_044db4ac60.png


--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/list-models.md:
--------------------------------------------------------------------------------
 1 | # List Models
 2 | 
 3 | ```console
 4 | ./cray llm ls
 5 | ```
 6 | 
 7 | This command lists all of the models that have been trained on the ScalarLM server.
 8 | 
 9 | ```console
10 | 69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e
11 | ```
12 | 
13 | ScalarLM names models with a unique identifier based on the input data and training parameters.
14 | 
15 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/plot.md:
--------------------------------------------------------------------------------
 1 | # Plot
 2 | 
 3 | ```console
 4 | ./cray llm plot
 5 | ```
 6 | 
 7 | This command plots the training loss of a specified model.
 8 | 
 9 | If no model is specified, the command will plot the training loss of the most recently trained model.
10 | 
11 | ![Plot](../assets/loss_plot_044db4ac60.png)
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/cli/squeue.md:
--------------------------------------------------------------------------------
 1 | # squeue
 2 | 
 3 | ```console
 4 | ./cray llm squeue
 5 | ```
 6 | 
 7 | This command is a wrapper around the `squeue` command. It is used to display the status of jobs in the training queue. The output is similar to the `squeue` command, but with some additional formatting.
 8 | 
 9 | ```console
10 |              JOBID PARTITION         NAME     USER    STATE       TIME TIME_LIMI  NODES NODELIST(REASON)
11 |                  8     short 00f186ab039b     root  PENDING       0:00     20:00      1 (Priority)
12 |                  7     short f1ba9c0eb11b     root  PENDING       0:00     20:00      1 (Priority)
13 |                  6     short 0746261fd1db     root  PENDING       0:00     20:00      1 (Priority)
14 |                  5     short ae55dedbb496     root  PENDING       0:00     20:00      1 (Priority)
15 |                  4     short d2bc30a36081     root  PENDING       0:00     20:00      1 (Priority)
16 |                  3     short bce8e63a7bef     root  PENDING       0:00     20:00      1 (Resources)
17 |                  2     short c42b59ab0fb1     root  RUNNING       0:34     20:00      1 df294b9206ff
18 | ```
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/contact.md:
--------------------------------------------------------------------------------
 1 | # Contact Us
 2 | 
 3 | Project ScalarLM is developed by an Artificial Intelligence engineering consortium, built on a philosophy of open collaboration to improve AI systems. Through our collective engineering efforts with industry and academia we continually integrate and improve the accuracy, safety, speed, and efficiency of AI technologies–helping companies and universities around the world build better AI systems that will benefit society.
 4 | 
 5 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69)
 6 | 
 7 | 
 8 | * Greg Diamos
 9 | * Naila Farooqui
10 | * Sudnya Diamos
11 | * Suhabe Bugrara
12 | 
13 | 
14 | We accept community contributions and are always looking for new collaborators. If you are interested in contributing to Project ScalarLM, please reach out to us at [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69).
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/deployment/docker.md:
--------------------------------------------------------------------------------
 1 | # Docker builds
 2 | 
 3 | Check out prebuilt docker containers for different targets:
 4 | 
 5 | | Target | Container                   | Latest Release v0.5      |
 6 | -------- | --------------------------- | ------------------------ |
 7 | | NVIDIA | gdiamos/cray-nvidia:latest  | gdiamos/cray-nvidia:v0.5 |
 8 | | ARM    | gdiamos/cray-arm:latest     | gdiamos/cray-arm:v0.5    |
 9 | | AMD    | gdiamos/cray-amd:latest     | gdiamos/cray-amd:v0.5    |
10 | | x86    | gdiamos/cray-cpu:latest     | gdiamos/cray-cpu:v0.5    |
11 | 
12 | For example, to launch a development server on a modern macbook, e.g. m2
13 | 
14 | ```bash
15 | docker run -it -p 8000:8000 --entrypoint /app/cray/scripts/start_one_server.sh gdiamos/cray-arm:v0.5
16 | ```
17 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/deployment/modal.md:
--------------------------------------------------------------------------------
 1 | # Modal
 2 | 
 3 | ScalarLM can be deployed on Modal for easy access to GPUs.
 4 | 
 5 | Clone the [ScalarLM repository](https://github.com/tensorwavecloud/scalarlm) and start the server.
 6 | 
 7 | ```console
 8 | git clone git@github.com:tensorwavecloud/scalarlm.git
 9 | cd cray
10 | ./cray deploy
11 | ```
12 | 
13 | Modal should give you an endpoint you can start using.
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Welcome to ScalarLM
 2 | 
 3 | ScalarLM is a fully open source, CC-0 Licensed, integrated LLM inference and training platform.
 4 | 
 5 | ScalarLM builds on top of the vLLM inference engine, the Megatron-LM training framework, and the HuggingFace model hub. It unifies the capabilities of these tools into a single platform, enabling users to easily perform LLM inference and training, and build higher lever applications such as Agents with a twist - they can teach themselves new abilities via back propagation.
 6 | 
 7 | ScalarLM is designed for high peformance. It inherits the distributed training capabilities of Megatron-LM and the optimized inference engine of vLLM. Cray is also designed to be easy to use. It provides an OpenAI compatible server and a simple command line interface for users to interact with the platform.
 8 | 
 9 | ScalarLM is inspired by the work of Seymour Roger Cray (September 28, 1925 – October 5, 1996), an American electrical engineer and supercomputer architect who designed a series of computers that were the fastest in the world for decades, and founded Cray Research, which built many of these machines. Called "the father of supercomputing", Cray has been credited with creating the supercomputer industry.
10 | 
11 | Learn more about ScalarLM at our [Blog](https://blog.scalarlm.com) and [GitHub](https://github.com/scalarlm/scalarlm).
12 | 
13 | [Get in Touch](https://forms.gle/tk6LFVrTQDSQp8L69)
14 | 
15 | ![ScalarLM](assets/cray.jpeg)
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/inference.md:
--------------------------------------------------------------------------------
 1 | # Inference
 2 | 
 3 | 
 4 | ## OpenAI Compatible Server
 5 | 
 6 | ```console
 7 | curl https://meta-llama--llama-3-2-3b-instruct.cray-lm.com/v1/openai/chat/completions \
 8 |     -H "Content-Type: application/json" \
 9 |     -d '{
10 |         "model": "meta-llama/Llama-3.2-3B-Instruct",
11 |         "messages": [
12 |             {"role": "system", "content": "You are a helpful assistant."},
13 |             {"role": "user", "content": "Who won the world series in 2020?"}
14 |         ]
15 |     }'
16 | ```
17 | 
18 | ## Using the Python client
19 | 
20 | You can also use the Python client to interact with the ScalarLM server.
21 | 
22 | ```python
23 | 
24 | import masint
25 | 
26 | masint.api_url = "https://meta-llama--llama-3-2-3b-instruct.cray-lm.com"
27 | 
28 | def get_dataset():
29 |     dataset = []
30 | 
31 |     count = 4
32 | 
33 |     for i in range(count):
34 |         dataset.append(f"What is {i} + {i}?")
35 | 
36 |     return dataset
37 | 
38 | 
39 | llm = masint.SupermassiveIntelligence()
40 | 
41 | dataset = get_dataset()
42 | 
43 | results = llm.generate(prompts=dataset)
44 | 
45 | print(results)
46 | ```
47 | 
48 | 


--------------------------------------------------------------------------------
/docs/cray-docs/docs/training.md:
--------------------------------------------------------------------------------
 1 | # Training
 2 | 
 3 | ## Training jobs
 4 | 
 5 | You can also use the Python client to submit training jobs to the ScalarLM server.
 6 | 
 7 | ```python
 8 | 
 9 | import masint
10 | 
11 | def get_dataset():
12 |     dataset = []
13 | 
14 |     count = 5
15 | 
16 |     for i in range(count):
17 |         dataset.append(
18 |             {"input": f"What is {i} + {i}?", "output": str(i + i)}
19 |         )
20 | 
21 |     return dataset
22 | 
23 | 
24 | llm = masint.SupermassiveIntelligence()
25 | 
26 | dataset = get_dataset()
27 | 
28 | status = llm.train(dataset, train_args={"max_steps": 200, "learning_rate": 3e-3})
29 | 
30 | print(status)
31 | ```
32 | 
33 | You get a command line output like this:
34 | 
35 | ```console
36 | (environment) gregorydiamos@Air-Gregory cray % python test/deployment/train.py
37 | {'job_id': '1', 'status': 'QUEUED', 'message': 'Training job launched', 'dataset_id': 'dataset', 'job_directory': '/app/cray/jobs/69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e', 'model_name': '69118a251a074f9f9d37a2ddc903243e428d30c3c31ad019cbf62ac777e42e6e'}
38 | ```
39 | 
40 | 


--------------------------------------------------------------------------------
/docs/cray-docs/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: ScalarLM Docs
 2 | nav:
 3 |     - Getting Started:
 4 |         - Introduction: index.md
 5 |         - Quick Start: quickstart.md
 6 |         - Architecture: arch.md
 7 |         - Contact: contact.md
 8 |     - Examples:
 9 |         - Inference: inference.md
10 |         - Training: training.md
11 |     - Command Line:
12 |         - CLI: cli/cli.md
13 |         - List Models: cli/list-models.md
14 |         - Training Logs: cli/training-logs.md
15 |         - Squeue: cli/squeue.md
16 |         - Plot: cli/plot.md
17 |     - Deployment:
18 |         - Laptop: deployment/laptop.md
19 |         - Kubernetes: deployment/kubernetes.md
20 |         - Modal: deployment/modal.md
21 |         - Modal Details: deployment/modal-details.md
22 |         - Docker: deployment/docker.md
23 | 


--------------------------------------------------------------------------------
/docs/deploy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 | 
15 | # Clean up the old deployment directory
16 | rm -rf $LOCAL_DIRECTORY/gh-pages-deployment
17 | 
18 | # Clone the repository
19 | git clone git@github.com:tensorwavecloud/scalarlm $LOCAL_DIRECTORY/gh-pages-deployment
20 | 
21 | # Change to the deployment directory
22 | cd $LOCAL_DIRECTORY/gh-pages-deployment
23 | 
24 | # Change to the git branch
25 | git checkout gh-pages
26 | 
27 | # Copy the local files from cray-docs to the deployment directory
28 | cp $LOCAL_DIRECTORY/cray-docs/mkdocs.yml $LOCAL_DIRECTORY/gh-pages-deployment
29 | cp -r $LOCAL_DIRECTORY/cray-docs/docs $LOCAL_DIRECTORY/gh-pages-deployment/docs
30 | 
31 | # Add all the files to the git repository
32 | #git add .
33 | 
34 | # Commit the changes
35 | #git commit -m "Deploying the latest documentation"
36 | 
37 | # Run mkdocs gh-deploy
38 | mkdocs gh-deploy
39 | 
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/frontend/assets/logo.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400">
 2 |   <defs>
 3 |     <radialGradient id="blackHoleGradient" cx="50%" cy="50%" r="50%" fx="50%" fy="50%">
 4 |       <stop offset="0%" stop-color="#1A1A1A" stop-opacity="1"/>
 5 |       <stop offset="70%" stop-color="#000000" stop-opacity="0.9"/>
 6 |       <stop offset="100%" stop-color="#000000" stop-opacity="0"/>
 7 |     </radialGradient>
 8 |     <filter id="glowEffect" x="-50%" y="-50%" width="200%" height="200%">
 9 |       <feGaussianBlur class="blur" stdDeviation="15" result="coloredBlur"/>
10 |       <feMerge>
11 |         <feMergeNode in="coloredBlur"/>
12 |         <feMergeNode in="SourceGraphic"/>
13 |       </feMerge>
14 |     </filter>
15 |   </defs>
16 | 
17 |   <!-- Outer Accretion Disk Rings -->
18 |   <path d="M200 50
19 |            A150 150 0 0 1 350 200
20 |            A150 150 0 0 1 200 350
21 |            A150 150 0 0 1 50 200
22 |            A150 150 0 0 1 200 50"
23 |         fill="none"
24 |         stroke="url(#blackHoleGradient)"
25 |         stroke-width="40"
26 |         filter="url(#glowEffect)"/>
27 | 
28 |   <!-- Inner Event Horizon -->
29 |   <circle cx="200" cy="200" r="80"
30 |           fill="url(#blackHoleGradient)"
31 |           filter="url(#glowEffect)"/>
32 | </svg>
33 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/aiohttp/get_global_session.py:
--------------------------------------------------------------------------------
 1 | import aiohttp
 2 | 
 3 | session = None
 4 | 
 5 | 
 6 | def get_global_session():
 7 |     global session
 8 |     if session is None:
 9 |         session = aiohttp.ClientSession()
10 |     return session
11 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/generate/finish_work.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue
 2 | 
 3 | from cray_infra.api.fastapi.routers.request_types.finish_work_request import FinishWorkRequests
 4 | 
 5 | import logging
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | async def finish_work(requests : FinishWorkRequests):
10 |     inference_work_queue = get_inference_work_queue()
11 | 
12 |     for request in requests.requests:
13 |         logger.debug(f"Finishing work for request {request.request_id}")
14 | 
15 |         result = inference_work_queue.get_id(id=request.request_id)
16 | 
17 |         if request.response is not None:
18 |             result["response"] = request.response
19 | 
20 |         if request.error is not None:
21 |             result["error"] = request.error
22 | 
23 |         inference_work_queue.update(id=request.request_id, item=result)
24 | 
25 |         inference_work_queue.ack(id=request.request_id)
26 | 
27 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/generate/get_results.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.request_types.get_results_request import GetResultsRequest
2 | 
3 | from cray_infra.api.fastapi.generate.poll_for_responses import poll_for_responses
4 | 
5 | 
6 | async def get_results(request: GetResultsRequest):
7 |     return await poll_for_responses(request.request_ids)
8 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/health/check_health.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.fastapi.aiohttp.get_global_session import get_global_session
 2 | from cray_infra.util.get_config import get_config
 3 | 
 4 | 
 5 | async def check_health():
 6 |     vllm_health = await get_vllm_health()
 7 |     api_health = "up"
 8 |     all_health = get_all_health([vllm_health, api_health])
 9 |     return {"api": "up", "vllm": vllm_health, "all": all_health}
10 | 
11 | 
12 | def get_all_health(healths):
13 |     if all(health == "up" for health in healths):
14 |         return "up"
15 | 
16 |     if all(health == "down" for health in healths):
17 |         return "down"
18 | 
19 |     return "mixed"
20 | 
21 | 
22 | async def get_vllm_health():
23 |     try:
24 |         session = get_global_session()
25 |         config = get_config()
26 |         async with session.get(config["vllm_api_url"] + "/health") as resp:
27 |             assert resp.status == 200
28 |             return "up"
29 |     except Exception as e:
30 |         return {"status": "down", "reason": str(e)}
31 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/main.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.fastapi.routers.openai_router import (
 2 |     openai_router,
 3 | )
 4 | from cray_infra.api.fastapi.routers.megatron_router import (
 5 |     megatron_router,
 6 | )
 7 | from cray_infra.api.fastapi.routers.health_router import (
 8 |     health_router,
 9 | )
10 | from cray_infra.api.fastapi.routers.generate_router import (
11 |     generate_router,
12 | )
13 | 
14 | from cray_infra.api.fastapi.tasks.add_megatron_tasks import (
15 |     add_megatron_tasks,
16 | )
17 | 
18 | from fastapi import FastAPI, Request
19 | from fastapi.middleware.cors import CORSMiddleware
20 | 
21 | import logging
22 | import os
23 | 
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | 
28 | app = FastAPI(lifespan=add_megatron_tasks)
29 | 
30 | app.include_router(openai_router, prefix="/v1")
31 | app.include_router(megatron_router, prefix="/v1")
32 | app.include_router(health_router, prefix="/v1")
33 | app.include_router(generate_router, prefix="/v1")
34 | 
35 | 
36 | origins = [
37 |     "http://localhost:3000",
38 | ]
39 | 
40 | app.add_middleware(
41 |     CORSMiddleware,
42 |     allow_origins=origins,
43 |     allow_credentials=True,
44 |     allow_methods=["*"],
45 |     allow_headers=["*"],
46 | )
47 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/health_router.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.fastapi.health.check_health import check_health
 2 | 
 3 | from fastapi import APIRouter
 4 | 
 5 | from fastapi.responses import JSONResponse
 6 | import logging
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | health_router = APIRouter(prefix="/health")
11 | 
12 | 
13 | @health_router.get("")
14 | async def health():
15 |     return await check_health()
16 | 
17 | 
18 | @health_router.get("/keepalive")
19 | async def health():
20 |     return {"status": "ok"}
21 | 
22 | 
23 | @health_router.get("/endpoints")
24 | async def list_routes():
25 |     routes = [
26 |         f"Path: {route.path}, Methods: {', '.join(route.methods)}"
27 |         for route in health_router.routes
28 |     ]
29 |     return JSONResponse(content={"endpoints": routes}, media_type="application/json")
30 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/embed_request.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class EmbedRequest(BaseModel):
 7 |     model: Optional[str] = None
 8 |     prompts: list[str]
 9 | 
10 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/finish_work_request.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional, Union
 4 | 
 5 | 
 6 | class FinishWorkRequest(BaseModel):
 7 |     request_id: int
 8 |     response: Optional[Union[str, list[float]]] = None
 9 |     error: Optional[str] = None
10 | 
11 | 
12 | class FinishWorkRequests(BaseModel):
13 |     requests: list[FinishWorkRequest]
14 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/generate_request.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class GenerateRequest(BaseModel):
 7 |     model: Optional[str] = None
 8 |     prompts: list[str]
 9 |     max_tokens: Optional[int] = 16
10 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/generate_response.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional, Union
 4 | 
 5 | class Result(BaseModel):
 6 |     request_id: int
 7 |     response: Optional[Union[str, list[float]]] = None
 8 |     error: Optional[str] = None
 9 | 
10 | class GenerateResponse(BaseModel):
11 |     results: list[Result]
12 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_results_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | from typing import Optional
4 | 
5 | class GetResultsRequest(BaseModel):
6 |     request_ids: list[int]
7 | 
8 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_results_response.py:
--------------------------------------------------------------------------------
1 | from cray_infra.api.fastapi.routers.request_types.generate_response import GenerateResponse as GetResultsResponse
2 | 
3 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_work_request.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | from typing import Optional
4 | 
5 | class GetWorkRequest(BaseModel):
6 |     batch_size: int
7 | 
8 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/get_work_response.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class GetWorkResponse(BaseModel):
 7 |     prompt: str
 8 |     request_id: int
 9 |     request_type: str
10 |     model: Optional[str] = None
11 |     max_tokens: Optional[int] = None
12 | 
13 | 
14 | class GetWorkResponses(BaseModel):
15 |     requests: list[GetWorkResponse]
16 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/list_models_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | 
4 | class ListModelsResponse(BaseModel):
5 |     models: list[dict]
6 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/squeue_response.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | from typing import Optional
4 | 
5 | 
6 | class SqueueResponse(BaseModel):
7 |     squeue_output : Optional[str] = None
8 |     error_message : Optional[str] = None
9 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/routers/request_types/train_request.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class TrainResponse(BaseModel):
 7 |     job_status: dict
 8 |     job_config: dict
 9 |     deployed: Optional[bool] = False
10 | 


--------------------------------------------------------------------------------
/infra/cray_infra/api/fastapi/tasks/add_megatron_tasks.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.get_config import get_config
 2 | 
 3 | from cray_infra.training.restart_megatron_jobs import restart_megatron_jobs
 4 | from cray_infra.training.register_megatron_models import register_megatron_models
 5 | from cray_infra.generate.clear_acked_requests_from_queue import clear_acked_requests_from_queue
 6 | 
 7 | from fastapi_utils.tasks import repeat_every
 8 | 
 9 | from contextlib import asynccontextmanager
10 | 
11 | import traceback
12 | import sys
13 | import logging
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | @asynccontextmanager
19 | async def add_megatron_tasks(app):
20 |     config = get_config()
21 | 
22 |     megatron_refresh_period = config["megatron_refresh_period"]
23 | 
24 |     @repeat_every(seconds=megatron_refresh_period)
25 |     async def run_megatron_tasks():
26 |         try:
27 |             await register_megatron_models()
28 |             await restart_megatron_jobs()
29 |             await clear_acked_requests_from_queue()
30 |         except Exception as e:
31 |             print_exception()
32 |             raise e
33 | 
34 |     await run_megatron_tasks()
35 | 
36 |     yield
37 | 
38 | 
39 | def print_exception():
40 |     exc_type, exc_value, exc_traceback = sys.exc_info()
41 |     messages = traceback.format_exception(exc_type, exc_value, exc_traceback)
42 | 
43 |     logger.error("".join(messages))
44 | 


--------------------------------------------------------------------------------
/infra/cray_infra/generate/clear_acked_requests_from_queue.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.work_queue.inference_work_queue import get_inference_work_queue
 2 | 
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | async def clear_acked_requests_from_queue():
 8 |     inference_work_queue = get_inference_work_queue()
 9 | 
10 |     starting_size = len(inference_work_queue)
11 | 
12 |     inference_work_queue.clear_acked_data()
13 | 
14 |     ending_size = len(inference_work_queue)
15 | 
16 |     logger.info(f"Cleared {starting_size - ending_size} acked requests from the queue.")
17 | 
18 | 


--------------------------------------------------------------------------------
/infra/cray_infra/one_server/create_api.py:
--------------------------------------------------------------------------------
 1 | import uvicorn
 2 | 
 3 | 
 4 | async def create_api(port, running_status):
 5 |     server_config = uvicorn.Config(
 6 |         "cray_infra.api.fastapi.main:app",
 7 |         host="0.0.0.0",
 8 |         port=port,
 9 |         log_level="info",
10 |     )
11 |     server = uvicorn.Server(server_config)
12 |     running_status.servers.append(server)
13 | 
14 |     await server.serve()
15 | 


--------------------------------------------------------------------------------
/infra/cray_infra/one_server/create_vllm.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.get_config import get_config
 2 | 
 3 | from vllm.entrypoints.openai.api_server import run_server
 4 | from vllm.entrypoints.openai.cli_args import make_arg_parser
 5 | from vllm.utils import FlexibleArgumentParser
 6 | 
 7 | import torch
 8 | 
 9 | import uvicorn
10 | import os
11 | 
12 | import logging
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | async def create_vllm(port, running_status):
17 | 
18 |     os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"
19 | 
20 |     config = get_config()
21 | 
22 |     parser = FlexibleArgumentParser(
23 |         description="vLLM OpenAI-Compatible RESTful API server."
24 |     )
25 |     parser = make_arg_parser(parser)
26 |     args = [
27 |         f"--dtype={config['dtype']}",
28 |         f"--max-model-len={config['max_model_length']}",
29 |         f"--max-num-batched-tokens={config['max_model_length']}",
30 |         f"--max-seq-len-to-capture={config['max_model_length']}",
31 |         f"--gpu-memory-utilization={config['gpu_memory_utilization']}",
32 |         f"--max-log-len={config['max_log_length']}",
33 |         f"--swap-space=0",
34 |         "--enable-lora",
35 |         "--disable-async-output-proc", # Disable async output processing for embeddings
36 |     ]
37 | 
38 |     if torch.cuda.is_available():
39 |         args.append("--device=cuda")
40 | 
41 |     args = parser.parse_args(args=args)
42 | 
43 |     args.port = port
44 |     args.model = config["model"]
45 | 
46 |     logger.info(f"Running vLLM with args: {args}")
47 | 
48 |     await run_server(args, running_status)
49 | 


--------------------------------------------------------------------------------
/infra/cray_infra/one_server/start_cray_server.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.create_api import create_api
 2 | from cray_infra.one_server.create_vllm import create_vllm
 3 | 
 4 | import asyncio
 5 | import logging
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | 
10 | async def start_cray_server(server_list: list):
11 | 
12 |     running_status = ServerStatus()
13 | 
14 |     logger.debug(f"Starting servers: {server_list}")
15 | 
16 |     if ("api" in server_list) or ("all" in server_list):
17 |         logger.debug("Starting API server")
18 |         api_task = asyncio.create_task(
19 |             create_api(port=8000, running_status=running_status)
20 |         )
21 |         running_status.tasks.append(api_task)
22 | 
23 |     if ("vllm" in server_list) or ("all" in server_list):
24 |         logger.debug("Starting VLLM server")
25 |         vllm_task = asyncio.create_task(
26 |             create_vllm(port=8001, running_status=running_status)
27 |         )
28 |         running_status.tasks.append(vllm_task)
29 | 
30 |     return running_status
31 | 
32 | 
33 | class ServerStatus:
34 |     def __init__(self):
35 |         self.servers = []
36 |         self.tasks = []
37 | 
38 |     async def shutdown(self):
39 |         for task in self.tasks:
40 |             logger.debug(f"Task {task} is cancelled")
41 |             task.cancel()
42 | 
43 |         for server in self.servers:
44 |             logger.debug(f"Server {server} is cancelled")
45 |             await server.shutdown()
46 | 


--------------------------------------------------------------------------------
/infra/cray_infra/one_server/wait_for_vllm.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.get_config import get_config
 2 | 
 3 | import asyncio
 4 | import aiohttp
 5 | 
 6 | import logging
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | async def wait_for_vllm():
12 |     for _ in range(30):
13 |         health_status = await get_vllm_health()
14 |         if health_status == 200:
15 |             return
16 |         await asyncio.sleep(1)
17 | 
18 | 
19 | async def get_vllm_health():
20 |     config = get_config()
21 | 
22 |     try:
23 |         async with aiohttp.ClientSession() as session:
24 |             async with session.get(config["vllm_api_url"] + "/health") as response:
25 |                 return response.status
26 |     except Exception as e:
27 |         logger.error(f"Error getting health: {e}")
28 |         return 500
29 | 


--------------------------------------------------------------------------------
/infra/cray_infra/training/get_latest_model.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.get_config import get_config
 2 | 
 3 | import os
 4 | import json
 5 | 
 6 | 
 7 | def get_latest_model():
 8 |     config = get_config()
 9 | 
10 |     if not os.path.exists(config["training_job_directory"]):
11 |         raise FileNotFoundError("No training jobs found")
12 | 
13 |     # Get the latest model by timestamp
14 |     models = os.listdir(config["training_job_directory"])
15 | 
16 |     if len(models) == 0:
17 |         raise FileNotFoundError("No training jobs found")
18 | 
19 |     models.sort(
20 |         key=lambda x: get_start_time(os.path.join(config["training_job_directory"], x)),
21 |         reverse=True,
22 |     )
23 | 
24 |     model_name = models[0]
25 | 
26 |     return model_name
27 | 
28 | 
29 | def get_start_time(path):
30 |     with open(os.path.join(path, "status.json")) as f:
31 |         status = json.load(f)
32 | 
33 |     if "history" not in status:
34 |         return 0
35 | 
36 |     return status.get("start_time", 0)
37 | 


--------------------------------------------------------------------------------
/infra/cray_infra/training/metrics.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from gpu_aware_mpi import get_rank
 3 | 
 4 | import logging
 5 | 
 6 | logger = logging.getLogger(__name__)
 7 | 
 8 | def log_gpu_memory(prefix=""):
 9 |     for i in range(torch.cuda.device_count()):
10 |         free, total = torch.cuda.mem_get_info(i)
11 |         rank = get_rank()
12 |         if rank == 0:
13 |             logger.debug(f"{prefix} GPU {i}: Free={free/1e6:.2f}MB, Total={total/1e6:.2f}MB")
14 | 
15 | def get_model_memory_footprint(model):
16 |     param_size = 0
17 |     for param in model.parameters():
18 |         param_size += param.numel() * param.element_size()
19 |     buffer_size = 0
20 |     for buffer in model.buffers():
21 |         buffer_size += buffer.numel() * buffer.element_size()
22 |     total_size = param_size + buffer_size
23 |     return total_size  # in bytes


--------------------------------------------------------------------------------
/infra/cray_infra/training/squeue.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.api.fastapi.routers.request_types.squeue_response import SqueueResponse
 2 | 
 3 | import subprocess
 4 | 
 5 | 
 6 | async def squeue():
 7 |     try:
 8 |         squeue_output = subprocess.check_output(
 9 |             ["squeue", '--format=%.18i %.9P %.12j %.8u %.8T %.10M %.9l %.6D %R']
10 |         )
11 | 
12 |         return SqueueResponse(
13 |             squeue_output=squeue_output.decode("utf-8"),
14 |         )
15 | 
16 |     except subprocess.CalledProcessError:
17 |         return SqueueResponse(
18 |             error_message="squeue command failed",
19 |         )
20 | 


--------------------------------------------------------------------------------
/infra/cray_infra/training/training_job_status.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | 
4 | class TrainingJobStatus(str, Enum):
5 |     QUEUED = "QUEUED"
6 |     TRAINING = "TRAINING"
7 |     COMPLETED = "COMPLETED"
8 |     FAILED = "FAILED"
9 | 


--------------------------------------------------------------------------------
/infra/cray_infra/training/vllm_model_manager.py:
--------------------------------------------------------------------------------
 1 | class VLLMModelManager:
 2 |     def __init__(self):
 3 |         self._models = []
 4 | 
 5 |     def set_registered_models(self, models):
 6 |         self._models = models
 7 | 
 8 |     def get_registered_models(self):
 9 |         return self._models
10 | 
11 |     def find_model(self, model_name):
12 |         for model in self._models:
13 |             if model_name in model:
14 |                 return model
15 |         return None
16 | 
17 | 
18 | def get_vllm_model_manager():
19 |     """
20 |     Returns a singleton instance of VLLMModelManager.
21 |     """
22 |     if not hasattr(get_vllm_model_manager, "_instance"):
23 |         get_vllm_model_manager._instance = VLLMModelManager()
24 |     return get_vllm_model_manager._instance
25 | 


--------------------------------------------------------------------------------
/infra/cray_infra/util/default_config.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | 
 4 | class Config(BaseModel):
 5 |     api_url: str = "http://localhost:8000"
 6 | 
 7 |     model: str = "meta-llama/llama-3.1-8b-instruct"
 8 |     
 9 |     # 10GB using 1024 for KB, 1024 for MB, 1024 for GB
10 |     max_upload_file_size: int = 1024 * 1024 * 1024 * 10
11 | 
12 |     train_job_entrypoint: str = "/app/cray/scripts/train_job_entrypoint.sh"
13 |     training_job_directory: str = "/app/cray/jobs"
14 | 
15 |     max_train_time: int = 15 * 60
16 |     extra_training_seconds: int = 300  # 5 minutes buffer before slurm kills the job
17 | 
18 |     slurm_wait_time: int = 30 # seconds
19 | 
20 |     megatron_refresh_period: int = 30 # seconds
21 | 
22 |     vllm_api_url: str = "http://localhost:8001"
23 | 
24 |     generate_batch_size: int = 1024
25 | 
26 |     response_timeout: int = 60 # seconds
27 |     inference_work_queue_timeout: int = 30 # seconds
28 | 
29 |     inference_work_queue_path: str = "/app/cray/inference_work_queue.sqlite"
30 | 
31 |     gpu_memory_utilization: float = 0.50
32 |     max_model_length: int = 8192
33 |     dtype: str = "bfloat16"
34 | 
35 |     max_log_length: int = 100
36 | 
37 |     server_list: str = "all"
38 | 
39 |     tokenformer_r: int = 32
40 |     tokenformer_num_heads: int = 4
41 | 
42 |     tokenformer_cache_capacity: int = 2
43 | 
44 | 


--------------------------------------------------------------------------------
/infra/cray_infra/util/default_job_config.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class LoraConfig(BaseModel):
 7 |     r: int = 32
 8 |     target_modules: str = "all-linear"
 9 |     use_rslora: bool = True
10 |     modules_to_save: list = ["lm_head"]
11 | 
12 | 
13 | class DiffusionForcingModelConfig(BaseModel):
14 |     num_hidden_layers: int = 2
15 |     num_diffusion_iterations: int = 3
16 |     diffusion_step_size: int = 2
17 |     hidden_size: int = 128
18 |     num_attention_heads: int = 4
19 |     attention_dropout: float = 0.1
20 | 
21 | 
22 | class JobConfig(BaseModel):
23 | 
24 |     job_directory: str
25 |     training_data_path: str
26 |     dataset_hash: str
27 | 
28 |     #llm_name: str = "masint/tiny-random-llama"
29 |     llm_name: str = "meta-llama/Llama-3.2-1B-Instruct"
30 | 
31 |     # Training
32 |     max_steps: int = 100
33 |     learning_rate: float = 3e-3
34 |     batch_size: int = 1
35 |     gradient_clip_value: float = 1.0
36 | 
37 |     max_token_block_size: int = 16777216 # 16 mega tokens
38 | 
39 |     # Checkpointing
40 |     steps_per_checkpoint: int = 100
41 |     max_checkpoints_to_keep: int = 3
42 | 
43 |     gpus: int = 1
44 |     nodes: int = 1
45 | 
46 |     lora_config: Optional[LoraConfig] = LoraConfig()
47 |     diffusion_forcing_config: Optional[DiffusionForcingModelConfig] = (
48 |         DiffusionForcingModelConfig()
49 |     )
50 | 
51 |     # 4 hours in seconds
52 |     timeout: int = 4 * 60 * 60
53 | 
54 |     training_history_length: int = 1024
55 | 
56 | 


--------------------------------------------------------------------------------
/infra/cray_infra/util/get_config.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.default_config import Config
 2 | 
 3 | import os
 4 | import yaml
 5 | 
 6 | 
 7 | def get_config():
 8 |     loaded_config = {}
 9 | 
10 |     config_path = "/app/cray/cray-config.yaml"
11 | 
12 |     if os.path.exists(config_path):
13 |         with open(config_path, "r") as stream:
14 |             loaded_config = yaml.safe_load(stream)
15 | 
16 |     return Config(**loaded_config).dict()
17 | 


--------------------------------------------------------------------------------
/infra/cray_infra/util/get_job_config.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.default_job_config import JobConfig
 2 | 
 3 | import yaml
 4 | import os
 5 | 
 6 | 
 7 | def get_job_config():
 8 |     job_config_path = get_job_config_path()
 9 | 
10 |     with open(job_config_path, "r") as stream:
11 |         job_config = yaml.safe_load(stream)
12 | 
13 |     # fill in missing values with defaults
14 |     job_config = JobConfig(**job_config).dict()
15 | 
16 |     return job_config
17 | 
18 | 
19 | def get_job_config_path():
20 |     assert (
21 |         "CRAY_TRAINING_JOB_CONFIG_PATH" in os.environ
22 |     ), "CRAY_TRAINING_JOB_CONFIG_PATH not set"
23 |     return os.environ["CRAY_TRAINING_JOB_CONFIG_PATH"]
24 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 2 | 
 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine
 5 | from vllm.engine.llm_engine import LLMEngine
 6 | from vllm.entrypoints.llm import LLM
 7 | from vllm.executor.ray_utils import initialize_ray_cluster
 8 | from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 9 | from vllm.model_executor.models import ModelRegistry
10 | from vllm.outputs import (
11 |     CompletionOutput,
12 |     EmbeddingOutput,
13 |     EmbeddingRequestOutput,
14 |     RequestOutput,
15 | )
16 | from vllm.pooling_params import PoolingParams
17 | from vllm.sampling_params import SamplingParams
18 | 
19 | from .version import __version__, __version_tuple__
20 | 
21 | __all__ = [
22 |     "__version__",
23 |     "__version_tuple__",
24 |     "LLM",
25 |     "ModelRegistry",
26 |     "PromptType",
27 |     "TextPrompt",
28 |     "TokensPrompt",
29 |     "SamplingParams",
30 |     "RequestOutput",
31 |     "CompletionOutput",
32 |     "EmbeddingOutput",
33 |     "EmbeddingRequestOutput",
34 |     "LLMEngine",
35 |     "EngineArgs",
36 |     "AsyncLLMEngine",
37 |     "AsyncEngineArgs",
38 |     "initialize_ray_cluster",
39 |     "PoolingParams",
40 | ]
41 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/_version.py:
--------------------------------------------------------------------------------
 1 | # file generated by setuptools_scm
 2 | # don't change, don't track in version control
 3 | TYPE_CHECKING = False
 4 | if TYPE_CHECKING:
 5 |     from typing import Tuple, Union
 6 | 
 7 |     VERSION_TUPLE = Tuple[Union[int, str], ...]
 8 | else:
 9 |     VERSION_TUPLE = object
10 | 
11 | version: str
12 | __version__: str
13 | __version_tuple__: VERSION_TUPLE
14 | version_tuple: VERSION_TUPLE
15 | 
16 | __version__ = version = "0.1.dev5+g815064c.d20241108"
17 | __version_tuple__ = version_tuple = (0, 1, "dev5", "g815064c.d20241108")
18 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/adapter_commons/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/layers.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Tuple
 3 | 
 4 | 
 5 | @dataclass
 6 | class AdapterMapping:
 7 |     # Per every token in input_ids:
 8 |     index_mapping: Tuple[int, ...]
 9 |     # Per sampled token:
10 |     prompt_mapping: Tuple[int, ...]
11 | 
12 |     def __post_init__(self):
13 |         self.index_mapping = tuple(self.index_mapping)
14 |         self.prompt_mapping = tuple(self.prompt_mapping)
15 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/request.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | 
 4 | class AdapterRequest(ABC):
 5 |     """
 6 |     Base class for adapter requests.
 7 |     """
 8 | 
 9 |     @property
10 |     @abstractmethod
11 |     def adapter_id(self) -> int:
12 |         raise NotImplementedError
13 | 
14 |     def __post_init__(self) -> None:
15 |         if self.adapter_id < 1:
16 |             raise ValueError(f"id must be > 0, got {self.adapter_id}")
17 | 
18 |     def __eq__(self, value: object) -> bool:
19 |         return isinstance(value, self.__class__) and self.adapter_id == value.adapter_id
20 | 
21 |     def __hash__(self) -> int:
22 |         return hash(self.adapter_id)
23 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/adapter_commons/worker_manager.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Any, Optional, Set
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | class AbstractWorkerManager(ABC):
 8 | 
 9 |     def __init__(self, device: torch.device):
10 |         self.device = device
11 | 
12 |     @property
13 |     @abstractmethod
14 |     def is_enabled(self) -> bool:
15 |         raise NotImplementedError
16 | 
17 |     @abstractmethod
18 |     def set_active_adapters(self, requests: Set[Any], mapping: Optional[Any]) -> None:
19 |         raise NotImplementedError
20 | 
21 |     @abstractmethod
22 |     def add_adapter(self, adapter_request: Any) -> bool:
23 |         raise NotImplementedError
24 | 
25 |     @abstractmethod
26 |     def remove_adapter(self, adapter_id: int) -> bool:
27 |         raise NotImplementedError
28 | 
29 |     @abstractmethod
30 |     def remove_all_adapters(self) -> None:
31 |         raise NotImplementedError
32 | 
33 |     @abstractmethod
34 |     def list_adapters(self) -> Set[int]:
35 |         raise NotImplementedError
36 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/assets/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/audio.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Literal, Tuple
 3 | from urllib.parse import urljoin
 4 | 
 5 | import librosa
 6 | import numpy as np
 7 | 
 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
 9 | 
10 | ASSET_DIR = "multimodal_asset"
11 | 
12 | 
13 | @dataclass(frozen=True)
14 | class AudioAsset:
15 |     name: Literal["winning_call", "mary_had_lamb"]
16 | 
17 |     @property
18 |     def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
19 | 
20 |         audio_path = get_vllm_public_assets(
21 |             filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR
22 |         )
23 |         y, sr = librosa.load(audio_path, sr=None)
24 |         assert isinstance(sr, int)
25 |         return y, sr
26 | 
27 |     @property
28 |     def url(self) -> str:
29 |         return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
30 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/base.py:
--------------------------------------------------------------------------------
 1 | from functools import lru_cache
 2 | from pathlib import Path
 3 | from typing import Optional
 4 | 
 5 | import vllm.envs as envs
 6 | from vllm.connections import global_http_connection
 7 | from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 8 | 
 9 | vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
10 | 
11 | 
12 | def get_cache_dir() -> Path:
13 |     """Get the path to the cache for storing downloaded assets."""
14 |     path = Path(envs.VLLM_ASSETS_CACHE)
15 |     path.mkdir(parents=True, exist_ok=True)
16 | 
17 |     return path
18 | 
19 | 
20 | @lru_cache
21 | def get_vllm_public_assets(filename: str, s3_prefix: Optional[str] = None) -> Path:
22 |     """
23 |     Download an asset file from ``s3://vllm-public-assets``
24 |     and return the path to the downloaded file.
25 |     """
26 |     asset_directory = get_cache_dir() / "vllm_public_assets"
27 |     asset_directory.mkdir(parents=True, exist_ok=True)
28 | 
29 |     asset_path = asset_directory / filename
30 |     if not asset_path.exists():
31 |         if s3_prefix is not None:
32 |             filename = s3_prefix + "/" + filename
33 |         global_http_connection.download_file(
34 |             f"{vLLM_S3_BUCKET_URL}/{filename}",
35 |             asset_path,
36 |             timeout=VLLM_IMAGE_FETCH_TIMEOUT,
37 |         )
38 | 
39 |     return asset_path
40 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/assets/image.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Literal
 3 | 
 4 | import torch
 5 | from PIL import Image
 6 | 
 7 | from vllm.assets.base import get_vllm_public_assets
 8 | 
 9 | VLM_IMAGES_DIR = "vision_model_images"
10 | 
11 | 
12 | @dataclass(frozen=True)
13 | class ImageAsset:
14 |     name: Literal["stop_sign", "cherry_blossom"]
15 | 
16 |     @property
17 |     def pil_image(self) -> Image.Image:
18 | 
19 |         image_path = get_vllm_public_assets(
20 |             filename=f"{self.name}.jpg", s3_prefix=VLM_IMAGES_DIR
21 |         )
22 |         return Image.open(image_path)
23 | 
24 |     @property
25 |     def image_embeds(self) -> torch.Tensor:
26 |         """
27 |         Image embeddings, only used for testing purposes with llava 1.5.
28 |         """
29 |         image_path = get_vllm_public_assets(
30 |             filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR
31 |         )
32 |         return torch.load(image_path)
33 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.attention.backends.abstract import (
 2 |     AttentionBackend,
 3 |     AttentionMetadata,
 4 |     AttentionMetadataBuilder,
 5 |     AttentionState,
 6 |     AttentionType,
 7 | )
 8 | from vllm.attention.layer import Attention
 9 | from vllm.attention.selector import get_attn_backend
10 | 
11 | __all__ = [
12 |     "Attention",
13 |     "AttentionBackend",
14 |     "AttentionMetadata",
15 |     "AttentionType",
16 |     "AttentionMetadataBuilder",
17 |     "Attention",
18 |     "AttentionState",
19 |     "get_attn_backend",
20 | ]
21 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/backends/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/attention/ops/blocksparse_attention/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/compilation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/compilation/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/core/block/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 | from .parallel_state import *
3 | from .utils import *
4 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/communication_op.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict, Optional, Union
 2 | 
 3 | import torch
 4 | import torch.distributed
 5 | 
 6 | from .parallel_state import get_tp_group
 7 | 
 8 | 
 9 | def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
10 |     """All-reduce the input tensor across model parallel group."""
11 |     return get_tp_group().all_reduce(input_)
12 | 
13 | 
14 | def tensor_model_parallel_all_gather(
15 |     input_: torch.Tensor, dim: int = -1
16 | ) -> torch.Tensor:
17 |     """All-gather the input tensor across model parallel group."""
18 |     return get_tp_group().all_gather(input_, dim)
19 | 
20 | 
21 | def tensor_model_parallel_gather(
22 |     input_: torch.Tensor, dst: int = 0, dim: int = -1
23 | ) -> Optional[torch.Tensor]:
24 |     """Gather the input tensor across model parallel group."""
25 |     return get_tp_group().gather(input_, dst, dim)
26 | 
27 | 
28 | def broadcast_tensor_dict(
29 |     tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
30 | ):
31 |     if not torch.distributed.is_initialized():
32 |         return tensor_dict
33 |     return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
34 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/distributed/device_communicators/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/engine/output_processor/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/engine/output_processor/util.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from typing import Sequence as GenericSequence
 3 | from typing import Union
 4 | 
 5 | from vllm.model_executor.layers.sampler import SamplerOutput
 6 | from vllm.sequence import PoolerOutput, SequenceGroupOutput
 7 | 
 8 | 
 9 | def create_output_by_sequence_group(
10 |     outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], num_seq_groups: int
11 | ) -> List[List[SequenceGroupOutput]]:
12 |     """Helper method which transforms a 2d list organized by
13 |     [step][sequence group] into [sequence group][step].
14 |     """
15 |     output_by_sequence_group: List[List[SequenceGroupOutput]] = [
16 |         [] for _ in range(num_seq_groups)
17 |     ]
18 |     for step in outputs:
19 |         for i, sequence_group_output in enumerate(step):
20 |             output_by_sequence_group[i].append(sequence_group_output)
21 | 
22 |     return output_by_sequence_group
23 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/entrypoints/openai/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/entrypoints/openai/tool_parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .abstract_tool_parser import ToolParser, ToolParserManager
 2 | from .hermes_tool_parser import Hermes2ProToolParser
 3 | from .internlm2_tool_parser import Internlm2ToolParser
 4 | from .llama_tool_parser import Llama3JsonToolParser
 5 | from .mistral_tool_parser import MistralToolParser
 6 | 
 7 | __all__ = [
 8 |     "ToolParser",
 9 |     "ToolParserManager",
10 |     "Hermes2ProToolParser",
11 |     "MistralToolParser",
12 |     "Internlm2ToolParser",
13 |     "Llama3JsonToolParser",
14 | ]
15 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/executor/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/msgspec_utils.py:
--------------------------------------------------------------------------------
 1 | from array import array
 2 | from typing import Any, Type
 3 | 
 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
 5 | 
 6 | 
 7 | def encode_hook(obj: Any) -> Any:
 8 |     """Custom msgspec enc hook that supports array types.
 9 | 
10 |     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
11 |     """
12 |     if isinstance(obj, array):
13 |         assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
14 |             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
15 |             f"Given array has a type code of {obj.typecode}."
16 |         )
17 |         return obj.tobytes()
18 | 
19 | 
20 | def decode_hook(type: Type, obj: Any) -> Any:
21 |     """Custom msgspec dec hook that supports array types.
22 | 
23 |     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
24 |     """
25 |     if type is array:
26 |         deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
27 |         deserialized.frombytes(obj)
28 |         return deserialized
29 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/multiproc_xpu_executor.py:
--------------------------------------------------------------------------------
 1 | import vllm.envs as envs
 2 | from vllm.executor.multiproc_gpu_executor import (
 3 |     MultiprocessingGPUExecutor,
 4 |     MultiprocessingGPUExecutorAsync,
 5 | )
 6 | from vllm.executor.xpu_executor import XPUExecutor
 7 | from vllm.logger import init_logger
 8 | from vllm.utils import make_async
 9 | 
10 | logger = init_logger(__name__)
11 | 
12 | 
13 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
14 |     """Python multiprocessing-based multi-XPU executor"""
15 | 
16 |     def _check_executor_parameters(self):
17 |         mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
18 |         if mp_method != "spawn":
19 |             raise RuntimeError(
20 |                 "XPU multiprocess executor only support spawn as mp method"
21 |             )
22 | 
23 | 
24 | class MultiprocessingXPUExecutorAsync(
25 |     MultiprocessingXPUExecutor, MultiprocessingGPUExecutorAsync
26 | ):
27 | 
28 |     def __init__(self, *args, **kwargs):
29 |         super().__init__(*args, **kwargs)
30 |         self.driver_exec_model = make_async(self.driver_worker.execute_model)
31 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/executor/ray_xpu_executor.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import List, Optional
 3 | 
 4 | import vllm.envs as envs
 5 | from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
 6 | from vllm.executor.xpu_executor import XPUExecutor
 7 | from vllm.logger import init_logger
 8 | from vllm.utils import get_vllm_instance_id, make_async
 9 | 
10 | logger = init_logger(__name__)
11 | 
12 | 
13 | class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
14 | 
15 |     def _get_env_vars_to_be_updated(self):
16 |         # Get the set of GPU IDs used on each node.
17 |         worker_node_and_gpu_ids = self._run_workers(
18 |             "get_node_and_gpu_ids", use_dummy_driver=True
19 |         )
20 | 
21 |         VLLM_INSTANCE_ID = get_vllm_instance_id()
22 | 
23 |         # Set environment variables for the driver and workers.
24 |         all_args_to_update_environment_variables = [
25 |             (
26 |                 {
27 |                     "VLLM_INSTANCE_ID": VLLM_INSTANCE_ID,
28 |                     "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION),
29 |                 },
30 |             )
31 |             for (_, _) in worker_node_and_gpu_ids
32 |         ]
33 |         return all_args_to_update_environment_variables
34 | 
35 | 
36 | class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):
37 | 
38 |     def __init__(self, *args, **kwargs):
39 |         super().__init__(*args, **kwargs)
40 |         self.driver_exec_method = make_async(self.driver_worker.execute_method)
41 |         self.pp_locks: Optional[List[asyncio.Lock]] = None
42 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/forward_context.py:
--------------------------------------------------------------------------------
 1 | from contextlib import contextmanager
 2 | from typing import Any
 3 | 
 4 | _forward_context: Any = None
 5 | 
 6 | 
 7 | def get_forward_context() -> Any:
 8 |     """Get the current forward context."""
 9 |     return _forward_context
10 | 
11 | 
12 | @contextmanager
13 | def set_forward_context(context: Any):
14 |     """A context manager that stores the current forward context,
15 |     can be attention metadata, etc."""
16 |     global _forward_context
17 |     prev_context = _forward_context
18 |     _forward_context = context
19 |     try:
20 |         yield
21 |     finally:
22 |         _forward_context = prev_context
23 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/inputs/__init__.py:
--------------------------------------------------------------------------------
 1 | from .data import (
 2 |     EncoderDecoderLLMInputs,
 3 |     ExplicitEncoderDecoderPrompt,
 4 |     LLMInputs,
 5 |     PromptType,
 6 |     SingletonPrompt,
 7 |     TextPrompt,
 8 |     TokensPrompt,
 9 |     build_explicit_enc_dec_prompt,
10 |     to_enc_dec_tuple_list,
11 |     zip_enc_dec_prompts,
12 | )
13 | from .registry import InputContext, InputRegistry
14 | 
15 | INPUT_REGISTRY = InputRegistry()
16 | """
17 | The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
18 | to dispatch data processing according to the target model.
19 | 
20 | See also:
21 |     :ref:`input_processing_pipeline`
22 | """
23 | 
24 | __all__ = [
25 |     "TextPrompt",
26 |     "TokensPrompt",
27 |     "PromptType",
28 |     "SingletonPrompt",
29 |     "ExplicitEncoderDecoderPrompt",
30 |     "LLMInputs",
31 |     "EncoderDecoderLLMInputs",
32 |     "build_explicit_enc_dec_prompt",
33 |     "to_enc_dec_tuple_list",
34 |     "zip_enc_dec_prompts",
35 |     "INPUT_REGISTRY",
36 |     "InputContext",
37 |     "InputRegistry",
38 | ]
39 | 
40 | 
41 | def __getattr__(name: str):
42 |     if name == "PromptInput":
43 |         import warnings
44 | 
45 |         msg = (
46 |             "PromptInput has been renamed to PromptType. "
47 |             "The original name will be removed in an upcoming version."
48 |         )
49 | 
50 |         warnings.warn(DeprecationWarning(msg), stacklevel=2)
51 | 
52 |         return PromptType
53 | 
54 |     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
55 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/logging/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.logging.formatter import NewLineFormatter
2 | 
3 | __all__ = [
4 |     "NewLineFormatter",
5 | ]
6 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/logging/formatter.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | 
 4 | class NewLineFormatter(logging.Formatter):
 5 |     """Adds logging prefix to newlines to align multi-line messages."""
 6 | 
 7 |     def __init__(self, fmt, datefmt=None, style="%"):
 8 |         logging.Formatter.__init__(self, fmt, datefmt, style)
 9 | 
10 |     def format(self, record):
11 |         msg = logging.Formatter.format(self, record)
12 |         if record.message != "":
13 |             parts = msg.split(record.message)
14 |             msg = msg.replace("\n", "\r\n" + parts[0])
15 |         return msg
16 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/lora/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/lora/ops/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter
 2 | from vllm.model_executor.sampling_metadata import (
 3 |     SamplingMetadata,
 4 |     SamplingMetadataCache,
 5 | )
 6 | from vllm.model_executor.utils import set_random_seed
 7 | 
 8 | __all__ = [
 9 |     "SamplingMetadata",
10 |     "SamplingMetadataCache",
11 |     "set_random_seed",
12 |     "BasevLLMParameter",
13 |     "PackedvLLMParameter",
14 | ]
15 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/fused_moe/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.layers.fused_moe.layer import (
 2 |     FusedMoE,
 3 |     FusedMoEMethodBase,
 4 |     FusedMoeWeightScaleSupported,
 5 | )
 6 | from vllm.triton_utils import HAS_TRITON
 7 | 
 8 | __all__ = [
 9 |     "FusedMoE",
10 |     "FusedMoEMethodBase",
11 |     "FusedMoeWeightScaleSupported",
12 | ]
13 | 
14 | if HAS_TRITON:
15 |     from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
16 |         fused_marlin_moe,
17 |         single_marlin_moe,
18 |     )
19 |     from vllm.model_executor.layers.fused_moe.fused_moe import (
20 |         fused_experts,
21 |         fused_moe,
22 |         fused_topk,
23 |         get_config_file_name,
24 |         grouped_topk,
25 |     )
26 | 
27 |     __all__ += [
28 |         "fused_marlin_moe",
29 |         "single_marlin_moe",
30 |         "fused_moe",
31 |         "fused_topk",
32 |         "fused_experts",
33 |         "get_config_file_name",
34 |         "grouped_topk",
35 |     ]
36 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
 1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
 2 | For different settings of
 3 | - E (number of experts)
 4 | - N (intermediate size)
 5 | - device_name (torch.cuda.get_device_name())
 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
 7 | 
 8 | The example configurations provided are for the Mixtral model for TP2 on H100
 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/mamba/ops/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py:
--------------------------------------------------------------------------------
 1 | from .compressed_tensors_scheme import CompressedTensorsScheme
 2 | from .compressed_tensors_w4a16_24 import (
 3 |     W4A16SPARSE24_SUPPORTED_BITS,
 4 |     CompressedTensorsW4A16Sparse24,
 5 | )
 6 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 7 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
 8 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 9 | from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
10 | 
11 | __all__ = [
12 |     "CompressedTensorsScheme",
13 |     "CompressedTensorsWNA16",
14 |     "CompressedTensorsW8A16Fp8",
15 |     "CompressedTensorsW4A16Sparse24",
16 |     "CompressedTensorsW8A8Int8",
17 |     "CompressedTensorsW8A8Fp8",
18 |     "WNA16_SUPPORTED_BITS",
19 |     "W4A16SPARSE24_SUPPORTED_BITS",
20 | ]
21 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_utils import replace_parameter, update_tensor_inplace
2 | 
3 | __all__ = ["update_tensor_inplace", "replace_parameter"]
4 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/layers/quantization/utils/machete_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Tuple
 2 | 
 3 | import torch
 4 | 
 5 | from vllm.scalar_type import ScalarType, scalar_types
 6 | 
 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128]
 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128]
 9 | 
10 | 
11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]:
12 |     if zero_points:
13 |         return [scalar_types.uint4, scalar_types.uint8]
14 |     else:
15 |         return [scalar_types.uint4b8, scalar_types.uint8b128]
16 | 
17 | 
18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]:
19 |     return [torch.float16, torch.bfloat16]
20 | 
21 | 
22 | def check_machete_supports_shape(
23 |     in_features: int, out_featrues: int
24 | ) -> Tuple[bool, Optional[str]]:
25 |     if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
26 |         return (
27 |             False,
28 |             "Input features size must be divisible by "
29 |             f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}",
30 |         )
31 |     if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
32 |         return (
33 |             False,
34 |             "Output features size must be divisible by "
35 |             f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}",
36 |         )
37 |     return True, None
38 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/model_loader/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from torch import nn
 4 | 
 5 | from vllm.config import (
 6 |     CacheConfig,
 7 |     DeviceConfig,
 8 |     LoadConfig,
 9 |     LoRAConfig,
10 |     ModelConfig,
11 |     ParallelConfig,
12 |     SchedulerConfig,
13 | )
14 | from vllm.model_executor.model_loader.loader import BaseModelLoader, get_model_loader
15 | from vllm.model_executor.model_loader.utils import (
16 |     get_architecture_class_name,
17 |     get_model_architecture,
18 | )
19 | 
20 | 
21 | def get_model(
22 |     *,
23 |     model_config: ModelConfig,
24 |     load_config: LoadConfig,
25 |     device_config: DeviceConfig,
26 |     parallel_config: ParallelConfig,
27 |     scheduler_config: SchedulerConfig,
28 |     lora_config: Optional[LoRAConfig],
29 |     cache_config: CacheConfig
30 | ) -> nn.Module:
31 |     loader = get_model_loader(load_config)
32 |     return loader.load_model(
33 |         model_config=model_config,
34 |         device_config=device_config,
35 |         lora_config=lora_config,
36 |         parallel_config=parallel_config,
37 |         scheduler_config=scheduler_config,
38 |         cache_config=cache_config,
39 |     )
40 | 
41 | 
42 | __all__ = [
43 |     "get_model",
44 |     "get_model_loader",
45 |     "BaseModelLoader",
46 |     "get_architecture_class_name",
47 |     "get_model_architecture",
48 | ]
49 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/model_loader/utils.py:
--------------------------------------------------------------------------------
 1 | """Utilities for selecting and loading models."""
 2 | 
 3 | import contextlib
 4 | from typing import Tuple, Type
 5 | 
 6 | import torch
 7 | from torch import nn
 8 | 
 9 | from vllm.config import ModelConfig
10 | from vllm.model_executor.models import ModelRegistry
11 | 
12 | 
13 | @contextlib.contextmanager
14 | def set_default_torch_dtype(dtype: torch.dtype):
15 |     """Sets the default torch dtype to the given dtype."""
16 |     old_dtype = torch.get_default_dtype()
17 |     torch.set_default_dtype(dtype)
18 |     yield
19 |     torch.set_default_dtype(old_dtype)
20 | 
21 | 
22 | def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
23 |     architectures = getattr(model_config.hf_config, "architectures", [])
24 |     # Special handling for quantized Mixtral.
25 |     # FIXME(woosuk): This is a temporary hack.
26 |     mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"]
27 | 
28 |     if (
29 |         model_config.quantization is not None
30 |         and model_config.quantization not in mixtral_supported
31 |         and "MixtralForCausalLM" in architectures
32 |     ):
33 |         architectures = ["QuantMixtralForCausalLM"]
34 | 
35 |     return ModelRegistry.resolve_model_cls(architectures)
36 | 
37 | 
38 | def get_architecture_class_name(model_config: ModelConfig) -> str:
39 |     return get_model_architecture(model_config)[1]
40 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .interfaces import (
 2 |     HasInnerState,
 3 |     SupportsLoRA,
 4 |     SupportsMultiModal,
 5 |     SupportsPP,
 6 |     has_inner_state,
 7 |     supports_lora,
 8 |     supports_multimodal,
 9 |     supports_pp,
10 | )
11 | from .interfaces_base import (
12 |     VllmModelForEmbedding,
13 |     VllmModelForTextGeneration,
14 |     is_embedding_model,
15 |     is_text_generation_model,
16 | )
17 | from .registry import ModelRegistry
18 | 
19 | __all__ = [
20 |     "ModelRegistry",
21 |     "VllmModelForEmbedding",
22 |     "is_embedding_model",
23 |     "VllmModelForTextGeneration",
24 |     "is_text_generation_model",
25 |     "HasInnerState",
26 |     "has_inner_state",
27 |     "SupportsLoRA",
28 |     "supports_lora",
29 |     "SupportsMultiModal",
30 |     "supports_multimodal",
31 |     "SupportsPP",
32 |     "supports_pp",
33 | ]
34 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/models/phi3.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Adapted from llama.py
 3 | """Inference-only Phi3 model code inherit from Llama.py"""
 4 | 
 5 | from vllm.model_executor.models.llama import LlamaForCausalLM
 6 | 
 7 | 
 8 | class Phi3ForCausalLM(LlamaForCausalLM):
 9 | 
10 |     packed_modules_mapping = {
11 |         "qkv_proj": [
12 |             "qkv_proj",
13 |         ],
14 |         "gate_up_proj": [
15 |             "gate_up_proj",
16 |         ],
17 |     }
18 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/model_executor/utils.py:
--------------------------------------------------------------------------------
 1 | """Utils for model executor."""
 2 | 
 3 | from typing import Any, Dict, Optional
 4 | 
 5 | import torch
 6 | 
 7 | from vllm.utils import seed_everything
 8 | 
 9 | 
10 | def set_random_seed(seed: int) -> None:
11 |     seed_everything(seed)
12 | 
13 | 
14 | def set_weight_attrs(
15 |     weight: torch.Tensor,
16 |     weight_attrs: Optional[Dict[str, Any]],
17 | ):
18 |     """Set attributes on a weight tensor.
19 | 
20 |     This method is used to set attributes on a weight tensor. This method
21 |     will not overwrite existing attributes.
22 | 
23 |     Args:
24 |         weight: The weight tensor.
25 |         weight_attrs: A dictionary of attributes to set on the weight tensor.
26 |     """
27 |     if weight_attrs is None:
28 |         return
29 |     for key, value in weight_attrs.items():
30 |         assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
31 |         setattr(weight, key, value)
32 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/multimodal/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import (
 2 |     BatchedTensorInputs,
 3 |     MultiModalDataBuiltins,
 4 |     MultiModalDataDict,
 5 |     MultiModalInputs,
 6 |     MultiModalPlugin,
 7 |     NestedTensors,
 8 | )
 9 | from .registry import MultiModalRegistry
10 | 
11 | MULTIMODAL_REGISTRY = MultiModalRegistry()
12 | """
13 | The global :class:`~MultiModalRegistry` is used by model runners to
14 | dispatch data processing according to its modality and the target model.
15 | 
16 | See also:
17 |     :ref:`input_processing_pipeline`
18 | """
19 | 
20 | __all__ = [
21 |     "BatchedTensorInputs",
22 |     "MultiModalDataBuiltins",
23 |     "MultiModalDataDict",
24 |     "MultiModalInputs",
25 |     "MultiModalPlugin",
26 |     "NestedTensors",
27 |     "MULTIMODAL_REGISTRY",
28 |     "MultiModalRegistry",
29 | ]
30 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/multimodal/audio.py:
--------------------------------------------------------------------------------
 1 | from vllm.inputs.registry import InputContext
 2 | from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin
 3 | 
 4 | 
 5 | class AudioPlugin(MultiModalPlugin):
 6 |     """Plugin for audio data."""
 7 | 
 8 |     def get_data_key(self) -> str:
 9 |         return "audio"
10 | 
11 |     def _default_input_mapper(
12 |         self, ctx: InputContext, data: object, **mm_processor_kwargs
13 |     ) -> MultiModalInputs:
14 |         raise NotImplementedError("There is no default audio input mapper")
15 | 
16 |     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
17 |         raise NotImplementedError("There is no default maximum multimodal tokens")
18 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/cpu.py:
--------------------------------------------------------------------------------
 1 | import psutil
 2 | import torch
 3 | 
 4 | from .interface import Platform, PlatformEnum
 5 | 
 6 | 
 7 | class CpuPlatform(Platform):
 8 |     _enum = PlatformEnum.CPU
 9 | 
10 |     @classmethod
11 |     def get_device_name(cls, device_id: int = 0) -> str:
12 |         return "cpu"
13 | 
14 |     @classmethod
15 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
16 |         return psutil.virtual_memory().total
17 | 
18 |     @classmethod
19 |     def inference_mode(cls):
20 |         return torch.no_grad()
21 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/rocm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from functools import lru_cache
 3 | 
 4 | import torch
 5 | 
 6 | from vllm.logger import init_logger
 7 | 
 8 | from .interface import DeviceCapability, Platform, PlatformEnum
 9 | 
10 | logger = init_logger(__name__)
11 | 
12 | if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
13 |     logger.warning(
14 |         "`fork` method is not supported by ROCm. "
15 |         "VLLM_WORKER_MULTIPROC_METHOD is overridden to"
16 |         " `spawn` instead."
17 |     )
18 |     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
19 | 
20 | 
21 | class RocmPlatform(Platform):
22 |     _enum = PlatformEnum.ROCM
23 | 
24 |     @classmethod
25 |     @lru_cache(maxsize=8)
26 |     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
27 |         major, minor = torch.cuda.get_device_capability(device_id)
28 |         return DeviceCapability(major=major, minor=minor)
29 | 
30 |     @classmethod
31 |     @lru_cache(maxsize=8)
32 |     def get_device_name(cls, device_id: int = 0) -> str:
33 |         return torch.cuda.get_device_name(device_id)
34 | 
35 |     @classmethod
36 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
37 |         device_props = torch.cuda.get_device_properties(device_id)
38 |         return device_props.total_memory
39 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/tpu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from .interface import Platform, PlatformEnum
 4 | 
 5 | 
 6 | class TpuPlatform(Platform):
 7 |     _enum = PlatformEnum.TPU
 8 | 
 9 |     @classmethod
10 |     def get_device_name(cls, device_id: int = 0) -> str:
11 |         raise NotImplementedError
12 | 
13 |     @classmethod
14 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
15 |         raise NotImplementedError
16 | 
17 |     @classmethod
18 |     def inference_mode(cls):
19 |         return torch.no_grad()
20 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/platforms/xpu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from .interface import DeviceCapability, Platform, PlatformEnum
 4 | 
 5 | 
 6 | class XPUPlatform(Platform):
 7 |     _enum = PlatformEnum.XPU
 8 | 
 9 |     @staticmethod
10 |     def get_device_capability(device_id: int = 0) -> DeviceCapability:
11 |         major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
12 |             "."
13 |         )
14 |         return DeviceCapability(major=int(major), minor=int(minor))
15 | 
16 |     @staticmethod
17 |     def get_device_name(device_id: int = 0) -> str:
18 |         return torch.xpu.get_device_name(device_id)
19 | 
20 |     @classmethod
21 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
22 |         device_props = torch.xpu.get_device_properties(device_id)
23 |         return device_props.total_memory
24 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/plugins/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from typing import Callable, Optional, Union
 3 | 
 4 | import vllm.envs as envs
 5 | 
 6 | logger = logging.getLogger(__name__)
 7 | 
 8 | 
 9 | def load_general_plugins():
10 |     """WARNING: plugins can be loaded for multiple times in different
11 |     processes. They should be designed in a way that they can be loaded
12 |     multiple times without causing issues.
13 |     """
14 |     import sys
15 | 
16 |     if sys.version_info < (3, 10):
17 |         from importlib_metadata import entry_points
18 |     else:
19 |         from importlib.metadata import entry_points
20 | 
21 |     allowed_plugins = envs.VLLM_PLUGINS
22 | 
23 |     discovered_plugins = entry_points(group="vllm.general_plugins")
24 |     for plugin in discovered_plugins:
25 |         logger.info("Found general plugin: %s", plugin.name)
26 |         if allowed_plugins is None or plugin.name in allowed_plugins:
27 |             try:
28 |                 func = plugin.load()
29 |                 func()
30 |                 logger.info("Loaded general plugin: %s", plugin.name)
31 |             except Exception:
32 |                 logger.exception("Failed to load general plugin: %s", plugin.name)
33 | 
34 | 
35 | _torch_compile_backend: Optional[Union[Callable, str]] = None
36 | 
37 | 
38 | def set_torch_compile_backend(backend: Union[Callable, str]):
39 |     global _torch_compile_backend
40 |     _torch_compile_backend = backend
41 | 
42 | 
43 | def get_torch_compile_backend() -> Optional[Union[Callable, str]]:
44 |     return _torch_compile_backend
45 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/pooling_params.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | 
 3 | import msgspec
 4 | 
 5 | 
 6 | class PoolingParams(
 7 |     msgspec.Struct, omit_defaults=True, array_like=True  # type: ignore[call-arg]
 8 | ):  # type: ignore[call-arg]
 9 |     """Pooling parameters for pooling.
10 | 
11 |     Attributes:
12 |         additional_data: Any additional data needed for pooling.
13 |     """
14 | 
15 |     additional_data: Optional[Any] = None
16 | 
17 |     def clone(self) -> "PoolingParams":
18 |         """Returns a deep copy of the PoolingParams instance."""
19 |         return PoolingParams(
20 |             additional_data=self.additional_data,
21 |         )
22 | 
23 |     def __repr__(self) -> str:
24 |         return f"PoolingParams(" f"additional_metadata={self.additional_data})"
25 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/prompt_adapter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/prompt_adapter/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/prompt_adapter/request.py:
--------------------------------------------------------------------------------
 1 | import msgspec
 2 | 
 3 | from vllm.adapter_commons.request import AdapterRequest
 4 | 
 5 | 
 6 | class PromptAdapterRequest(
 7 |     msgspec.Struct,
 8 |     array_like=True,  # type: ignore[call-arg]
 9 |     omit_defaults=True,  # type: ignore[call-arg]
10 |     frozen=True,
11 | ):  # type: ignore[call-arg]
12 |     """
13 |     Request for a Prompt adapter.
14 |     """
15 | 
16 |     __metaclass__ = AdapterRequest
17 | 
18 |     prompt_adapter_name: str
19 |     prompt_adapter_id: int
20 |     prompt_adapter_local_path: str
21 |     prompt_adapter_num_virtual_tokens: int
22 | 
23 |     def __hash__(self):
24 |         return super().__hash__()
25 | 
26 |     @property
27 |     def adapter_id(self):
28 |         return self.prompt_adapter_id
29 | 
30 |     @property
31 |     def name(self):
32 |         return self.prompt_adapter_name
33 | 
34 |     @property
35 |     def local_path(self):
36 |         return self.prompt_adapter_local_path
37 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/scalar_type.py:
--------------------------------------------------------------------------------
 1 | from ._core_ext import NanRepr, ScalarType
 2 | 
 3 | # naming generally follows: https://github.com/jax-ml/ml_dtypes
 4 | # for floating point types (leading f) the scheme is:
 5 | #  `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
 6 | #  flags:
 7 | #  - no-flags: means it follows IEEE 754 conventions
 8 | #  - f: means finite values only (no infinities)
 9 | #  - n: means nans are supported (non-standard encoding)
10 | # for integer types the scheme is:
11 | #  `[u]int<size_bits>[b<bias>]`
12 | #  - if bias is not present it means its zero
13 | 
14 | 
15 | class scalar_types:
16 |     int4 = ScalarType.int_(4, None)
17 |     uint4 = ScalarType.uint(4, None)
18 |     int8 = ScalarType.int_(8, None)
19 |     uint8 = ScalarType.uint(8, None)
20 |     float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN.value)
21 |     float8_e5m2 = ScalarType.float_IEEE754(5, 2)
22 |     float16_e8m7 = ScalarType.float_IEEE754(8, 7)
23 |     float16_e5m10 = ScalarType.float_IEEE754(5, 10)
24 | 
25 |     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
26 |     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE.value)
27 | 
28 |     # "gptq" types
29 |     uint4b8 = ScalarType.uint(4, 8)
30 |     uint8b128 = ScalarType.uint(8, 128)
31 | 
32 |     # colloquial names
33 |     bfloat16 = float16_e8m7
34 |     float16 = float16_e5m10
35 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/spec_decode/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/tokenformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/tokenformer/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.envs import VLLM_USE_MODELSCOPE
 2 | 
 3 | if VLLM_USE_MODELSCOPE:
 4 |     # Patch here, before each import happens
 5 |     import modelscope
 6 |     from packaging import version
 7 | 
 8 |     # patch_hub begins from modelscope>=1.18.1
 9 |     if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
10 |         raise ImportError(
11 |             "Using vLLM with ModelScope needs modelscope>=1.18.1, please "
12 |             "install by `pip install modelscope>=1.18.1`"
13 |         )
14 | 
15 |     from modelscope.utils.hf_util import patch_hub
16 | 
17 |     # Patch hub to download models from modelscope to speed up.
18 |     patch_hub()
19 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/configs/mllama.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config
 2 | 
 3 | 
 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
 5 |     """
 6 |     Use this class to override is_encoder_decoder:
 7 |     - transformers regards mllama as is_encoder_decoder=False
 8 |     - vllm needs is_encoder_decoder=True to enable cross-attention
 9 |     """
10 | 
11 |     def __init__(
12 |         self,
13 |         **kwargs,
14 |     ):
15 |         super().__init__(**kwargs)
16 |         self.is_encoder_decoder = True
17 | 
18 | 
19 | class MllamaConfig(mllama_hf_config.MllamaConfig):
20 | 
21 |     def __init__(
22 |         self,
23 |         text_config=None,
24 |         **kwargs,
25 |     ):
26 |         if isinstance(text_config, dict):
27 |             text_config = MllamaTextConfig(**text_config)
28 |         super().__init__(text_config=text_config, **kwargs)
29 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/configs/nvlm_d.py:
--------------------------------------------------------------------------------
 1 | # Adapted from
 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
 3 | # --------------------------------------------------------
 4 | # NVLM-D
 5 | # Copyright (c) 2024 NVIDIA
 6 | # Licensed under Apache 2.0 License [see LICENSE for details]
 7 | # --------------------------------------------------------
 8 | from .internvl import InternVLChatConfig
 9 | 
10 | 
11 | class NVLM_D_Config(InternVLChatConfig):
12 |     model_type = "NVLM_D"
13 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .mistral import MistralTokenizer
2 | 
3 | __all__ = ["MistralTokenizer"]
4 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/transformers_utils/utils.py:
--------------------------------------------------------------------------------
 1 | from os import PathLike
 2 | from pathlib import Path
 3 | from typing import Union
 4 | 
 5 | 
 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool:
 7 |     """Check if the file is a GGUF model."""
 8 |     model = Path(model)
 9 |     if not model.is_file():
10 |         return False
11 |     elif model.suffix == ".gguf":
12 |         return True
13 | 
14 |     with open(model, "rb") as f:
15 |         header = f.read(4)
16 |     return header == b"GGUF"
17 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/triton_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.triton_utils.importing import HAS_TRITON
 2 | 
 3 | __all__ = ["HAS_TRITON"]
 4 | 
 5 | if HAS_TRITON:
 6 | 
 7 |     from vllm.triton_utils.custom_cache_manager import maybe_set_triton_cache_manager
 8 |     from vllm.triton_utils.libentry import libentry
 9 | 
10 |     __all__ += ["maybe_set_triton_cache_manager", "libentry"]
11 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/triton_utils/importing.py:
--------------------------------------------------------------------------------
 1 | from importlib.util import find_spec
 2 | 
 3 | from vllm.logger import init_logger
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | HAS_TRITON = find_spec("triton") is not None
 8 | 
 9 | if not HAS_TRITON:
10 |     logger.info(
11 |         "Triton not installed; certain GPU-related functions" " will not be available."
12 |     )
13 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/usage/__init__.py


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/version.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     from ._version import __version__, __version_tuple__
 3 | except Exception as e:
 4 |     import warnings
 5 | 
 6 |     warnings.warn(f"Failed to read commit hash:\n{e}", RuntimeWarning, stacklevel=2)
 7 | 
 8 |     __version__ = "dev"
 9 |     __version_tuple__ = (0, 0, __version__)
10 | 


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/vllm_flash_attn/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/vllm_flash_attn/.gitkeep


--------------------------------------------------------------------------------
/infra/cray_infra/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/cray_infra/vllm/worker/__init__.py


--------------------------------------------------------------------------------
/infra/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 | 


--------------------------------------------------------------------------------
/infra/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "attention_generic.cuh"
 4 | 
 5 | #include <stdint.h>
 6 | #ifdef ENABLE_FP8
 7 |   #ifndef USE_ROCM
 8 |     #include <cuda_fp8.h>
 9 |   #endif  // USE_ROCM
10 | #endif    // ENABLE_FP8
11 | 
12 | namespace vllm {
13 | 
14 | enum class Fp8KVCacheDataType {
15 |   kAuto = 0,
16 |   kFp8E4M3 = 1,
17 |   kFp8E5M2 = 2,
18 | };
19 | 
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 |   using Type = uint8_t;
24 | };
25 | 
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 |   using Type = uint16_t;
29 | };
30 | 
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 |   using Type = uint32_t;
34 | };
35 | 
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 |   using Type = uint2;
39 | };
40 | 
41 | }  // namespace vllm
42 | 


--------------------------------------------------------------------------------
/infra/csrc/cache.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/all.h>
 4 | 
 5 | #include <map>
 6 | #include <vector>
 7 | 
 8 | void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
 9 |                  const torch::Tensor& block_mapping);
10 | 
11 | // Note: the key_caches and value_caches vectors are constant but
12 | // not the Tensors they contain. The vectors need to be const refs
13 | // in order to satisfy pytorch's C++ operator registration code.
14 | void copy_blocks(std::vector<torch::Tensor> const& key_caches,
15 |                  std::vector<torch::Tensor> const& value_caches,
16 |                  const torch::Tensor& block_mapping);
17 | 
18 | void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
19 |                        torch::Tensor& key_cache, torch::Tensor& value_cache,
20 |                        torch::Tensor& slot_mapping,
21 |                        const std::string& kv_cache_dtype, const double k_scale,
22 |                        const double v_scale);
23 | 
24 | void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
25 |                              torch::Tensor& key_cache,
26 |                              torch::Tensor& value_cache,
27 |                              torch::Tensor& slot_mapping,
28 |                              const std::string& kv_cache_dtype,
29 |                              const double k_scale, const double v_scale);
30 | 
31 | // Just for unittest
32 | void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
33 |                  const double scale, const std::string& kv_cache_dtype);
34 | 


--------------------------------------------------------------------------------
/infra/csrc/core/exception.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #define VLLM_IMPLIES(p, q) (!(p) || (q))
4 | 


--------------------------------------------------------------------------------
/infra/csrc/core/registration.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <Python.h>
 4 | 
 5 | #define _CONCAT(A, B) A##B
 6 | #define CONCAT(A, B) _CONCAT(A, B)
 7 | 
 8 | #define _STRINGIFY(A) #A
 9 | #define STRINGIFY(A) _STRINGIFY(A)
10 | 
11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
12 | // could be a macro instead of a literal token.
13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
14 | 
15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
16 | // could be a macro instead of a literal token.
17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
18 |   TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
19 | 
20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized
21 | // via python's import statement.
22 | #define REGISTER_EXTENSION(NAME)                                               \
23 |   PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
24 |     static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
25 |                                         STRINGIFY(NAME), nullptr, 0, nullptr}; \
26 |     return PyModule_Create(&module);                                           \
27 |   }
28 | 


--------------------------------------------------------------------------------
/infra/csrc/core/torch_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/library.h>
 2 | 
 3 | #include "scalar_type.hpp"
 4 | #include "registration.h"
 5 | 
 6 | // Note the CORE exstension will be built for (almost) all hardware targets so
 7 | // new additions must account for this. (currently not built for TPU and Neuron)
 8 | 
 9 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
10 |   // ScalarType, a custom class for representing data types that supports
11 |   // quantized types, declared here so it can be used when creating interfaces
12 |   // for custom ops.
13 |   vllm::ScalarTypeTorch::bind_class(lib);
14 | }
15 | 
16 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
17 | 


--------------------------------------------------------------------------------
/infra/csrc/cpu/cpu_types.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CPU_TYPES_HPP
 2 | #define CPU_TYPES_HPP
 3 | 
 4 | #if defined(__x86_64__)
 5 |   //x86 implementation
 6 |   #include "cpu_types_x86.hpp"
 7 | #elif defined(__POWER9_VECTOR__)
 8 |   //ppc implementation
 9 |   #include "cpu_types_vsx.hpp"
10 | #elif defined(__aarch64__)
11 |   //arm implementation
12 |   #include "cpu_types_arm.hpp"
13 | #else
14 |   #warning "unsupported vLLM cpu implementation"
15 | #endif
16 | 
17 | #endif


--------------------------------------------------------------------------------
/infra/csrc/cuda_compat.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef USE_ROCM
 4 |   #include <hip/hip_runtime.h>
 5 | #endif
 6 | 
 7 | #ifndef USE_ROCM
 8 |   #define WARP_SIZE 32
 9 | #else
10 |   #define WARP_SIZE warpSize
11 | #endif
12 | 
13 | #ifndef USE_ROCM
14 |   #define VLLM_LDG(arg) __ldg(arg)
15 | #else
16 |   #define VLLM_LDG(arg) *(arg)
17 | #endif
18 | 
19 | #ifndef USE_ROCM
20 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
21 |     __shfl_xor_sync(uint32_t(-1), var, lane_mask)
22 |   #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
23 |     __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
24 | #else
25 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
26 |   #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
27 |     __shfl_xor(var, lane_mask, width)
28 | #endif
29 | 
30 | #ifndef USE_ROCM
31 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
32 | #else
33 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
34 | #endif
35 | 
36 | #ifndef USE_ROCM
37 |   #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
38 |     __shfl_down_sync(uint32_t(-1), var, lane_delta)
39 | #else
40 |   #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
41 | #endif
42 | 
43 | #ifndef USE_ROCM
44 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
45 |     cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
46 | #else
47 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
48 |     hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
49 | #endif
50 | 


--------------------------------------------------------------------------------
/infra/csrc/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 4 |   #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
 5 |   #define DEVICE_INLINE __forceinline__ __device__
 6 |   #define HOST_INLINE __forceinline__ __host__
 7 | #else
 8 |   #define HOST_DEVICE_INLINE inline
 9 |   #define DEVICE_INLINE inline
10 |   #define HOST_INLINE inline
11 | #endif
12 | 
13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id);
14 | 
15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
16 | 


--------------------------------------------------------------------------------
/infra/csrc/cuda_utils_kernels.cu:
--------------------------------------------------------------------------------
 1 | #ifdef USE_ROCM
 2 |   #include <hip/hip_runtime.h>
 3 |   #include <hip/hip_runtime_api.h>
 4 | #endif
 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
 6 |   int device, value;
 7 |   if (device_id < 0) {
 8 |     cudaGetDevice(&device);
 9 |   } else {
10 |     device = device_id;
11 |   }
12 |   cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
13 |                          device);
14 |   return value;
15 | }
16 | 
17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
18 |   int64_t attribute;
19 |   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
20 |   // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
21 | 
22 | #ifdef USE_ROCM
23 |   attribute = hipDeviceAttributeMaxSharedMemoryPerBlock;
24 | #else
25 |   attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin;
26 | #endif
27 | 
28 |   return get_device_attribute(attribute, device_id);
29 | }
30 | 


--------------------------------------------------------------------------------
/infra/csrc/dispatch_utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from
 3 |  * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
 4 |  */
 5 | #pragma once
 6 | 
 7 | #include <torch/all.h>
 8 | 
 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
10 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
11 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
12 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
13 | 
14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
15 |   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
16 | 
17 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)   \
18 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
19 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
20 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
21 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
22 | 
23 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
24 |   AT_DISPATCH_SWITCH(TYPE, NAME,                               \
25 |                      VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
26 | 
27 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...)         \
28 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)  \
29 |   AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)  \
30 |   AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
31 |   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
32 |   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
33 | 
34 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
35 |   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
36 | 


--------------------------------------------------------------------------------
/infra/csrc/mamba/causal_conv1d/static_switch.h:
--------------------------------------------------------------------------------
 1 | // Inspired by
 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
 4 | // clang-format off
 5 | // adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/static_switch.h
 6 | 
 7 | #pragma once
 8 | 
 9 | /// @param COND       - a boolean expression to switch by
10 | /// @param CONST_NAME - a name given for the constexpr bool variable.
11 | /// @param ...       - code to execute for true and false
12 | ///
13 | /// Usage:
14 | /// ```
15 | /// BOOL_SWITCH(flag, BoolConst, [&] {
16 | ///     some_function<BoolConst>(...);
17 | /// });
18 | /// ```
19 | #define BOOL_SWITCH(COND, CONST_NAME, ...)                                           \
20 |     [&] {                                                                            \
21 |         if (COND) {                                                                  \
22 |             static constexpr bool CONST_NAME = true;                                 \
23 |             return __VA_ARGS__();                                                    \
24 |         } else {                                                                     \
25 |             static constexpr bool CONST_NAME = false;                                \
26 |             return __VA_ARGS__();                                                    \
27 |         }                                                                            \
28 |     }()
29 | 


--------------------------------------------------------------------------------
/infra/csrc/mamba/mamba_ssm/static_switch.h:
--------------------------------------------------------------------------------
 1 | // Inspired by
 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
 4 | 
 5 | // clang-format off
 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h
 7 | #pragma once
 8 | 
 9 | /// @param COND       - a boolean expression to switch by
10 | /// @param CONST_NAME - a name given for the constexpr bool variable.
11 | /// @param ...       - code to execute for true and false
12 | ///
13 | /// Usage:
14 | /// ```
15 | /// BOOL_SWITCH(flag, BoolConst, [&] {
16 | ///     some_function<BoolConst>(...);
17 | /// });
18 | /// ```
19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \
20 |   [&] {                                    \
21 |     if (COND) {                            \
22 |       constexpr bool CONST_NAME = true;    \
23 |       return __VA_ARGS__();                \
24 |     } else {                               \
25 |       constexpr bool CONST_NAME = false;   \
26 |       return __VA_ARGS__();                \
27 |     }                                      \
28 |   }()
29 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku4.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku4(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = true;
18 | 
19 |   if (false) {
20 |   }
21 |   AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
22 |   AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
23 |   AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
24 |   AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | // We return bool so we can create these different kernel calls as a sequence
 8 | // of if-elseif's.
 9 | bool call_marlin_moe_kernel_ku4(
10 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
12 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18 |     int m_block, int max_par, int cfg_max_m_blocks);
19 | 
20 | }  // namespace marlin_moe
21 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku4b8.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku4b8(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = false;
18 | 
19 |   if (false) {
20 |   }
21 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
22 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
23 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
24 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | // We return bool so we can create these different kernel calls as a sequence
 8 | // of if-elseif's.
 9 | bool call_marlin_moe_kernel_ku4b8(
10 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
12 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18 |     int m_block, int max_par, int cfg_max_m_blocks);
19 | 
20 | }  // namespace marlin_moe
21 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku8b128.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku8b128(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = false;
18 | 
19 |   if (false) {
20 |   }
21 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
22 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
23 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
24 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | bool call_marlin_moe_kernel_ku8b128(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks);
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/moe_ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/all.h>
4 | 
5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
6 |                   torch::Tensor& token_expert_indices,
7 |                   torch::Tensor& gating_output);
8 | 


--------------------------------------------------------------------------------
/infra/csrc/moe/torch_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include "core/registration.h"
 2 | #include "moe_ops.h"
 3 | 
 4 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 5 |   // Apply topk softmax to the gating outputs.
 6 |   m.def(
 7 |       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
 8 |       "token_expert_indices, Tensor gating_output) -> ()");
 9 |   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
10 | 
11 | #ifndef USE_ROCM
12 |   m.def(
13 |       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
14 |       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
15 |       "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
16 |       "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
17 |       "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
18 |       "int moe_block_size, bool replicate_input, bool apply_weights)"
19 |       " -> Tensor");
20 |   // conditionally compiled so impl registration is in source file
21 | #endif
22 | }
23 | 
24 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
25 | 


--------------------------------------------------------------------------------
/infra/csrc/prepare_inputs/advance_step.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/all.h>
 4 | 
 5 | #include <ATen/cuda/CUDAContext.h>
 6 | #include <c10/cuda/CUDAGuard.h>
 7 | #include <cuda.h>
 8 | #include <cuda_fp16.h>
 9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 | 
12 | namespace prepare_inputs {
13 | 
14 | static constexpr int max_threads = 256;
15 | static constexpr bool logging = false;
16 | 
17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
18 | 
19 | }  // namespace prepare_inputs
20 | 


--------------------------------------------------------------------------------
/infra/csrc/quantization/cutlass_w8a8/common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "cutlass/cutlass.h"
 4 | #include <climits>
 5 | 
 6 | /**
 7 |  * Helper function for checking CUTLASS errors
 8 |  */
 9 | #define CUTLASS_CHECK(status)                        \
10 |   {                                                  \
11 |     TORCH_CHECK(status == cutlass::Status::kSuccess, \
12 |                 cutlassGetStatusString(status))      \
13 |   }
14 | 
15 | inline uint32_t next_pow_2(uint32_t const num) {
16 |   if (num <= 1) return num;
17 |   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
18 | }
19 | 
20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
21 |   int max_shared_mem_per_block_opt_in = 0;
22 |   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
23 |                         cudaDevAttrMaxSharedMemoryPerBlockOptin,
24 |                         device);
25 |   return max_shared_mem_per_block_opt_in;
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/infra/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_8_cuh
 6 | #define _qdq_8_cuh
 7 | 
 8 | #include "qdq_util.cuh"
 9 | 
10 | namespace vllm {
11 | namespace gptq {
12 | 
13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
14 | 
15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
16 |                                                const uint32_t q_1,
17 |                                                half2 (&dq)[4], int stride,
18 |                                                const uint32_t zero) {
19 |   half dqh[8];
20 |   for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
21 |   for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
22 | 
23 |   for (int i = 0; i < 4; i++)
24 |     dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
25 | }
26 | 
27 | }  // namespace gptq
28 | }  // namespace vllm
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/infra/csrc/quantization/machete/machete_collective_builder.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "cutlass_extensions/vllm_collective_builder.cuh"
 4 | #include "machete_mainloop.cuh"
 5 | 
 6 | namespace cutlass::gemm::collective {
 7 | using namespace cute;
 8 | 
 9 | struct MacheteKernelTag {};
10 | 
11 | template <class ElementPairA_, class GmemLayoutA_, int AlignmentA,
12 |           class ElementPairB_, class GmemLayoutB_, int AlignmentB,
13 |           class ElementAccumulator, class TileShape_MNK, class ClusterShape_MNK,
14 |           class StageCountType, class KernelScheduleType>
15 | struct VLLMCollectiveBuilder<
16 |     MacheteKernelTag, arch::Sm90, arch::OpClassTensorOp, ElementPairA_,
17 |     GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, AlignmentB,
18 |     ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
19 |     KernelScheduleType,
20 |     cute::enable_if_t<(
21 |         cute::is_same_v<KernelScheduleType,
22 |                         KernelTmaWarpSpecializedMixedInput> ||
23 |         cute::is_same_v<KernelScheduleType,
24 |                         KernelTmaWarpSpecializedPingpongMixedInput> ||
25 |         cute::is_same_v<KernelScheduleType,
26 |                         KernelTmaWarpSpecializedCooperativeMixedInput>)>> {
27 |   using CollectiveOp = machete::MacheteCollectiveMma<
28 |       ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
29 |       AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
30 |       StageCountType, KernelScheduleType>;
31 | };
32 | 
33 | };  // namespace cutlass::gemm::collective


--------------------------------------------------------------------------------
/infra/csrc/quantization/machete/machete_interleaving_utils.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "cutlass/cutlass.h"
 4 | #include "cute/layout.hpp"
 5 | 
 6 | namespace machete {
 7 | 
 8 | using namespace cute;
 9 | 
10 | // get an interleaved block layout where each element consecutive element has a
11 | // stride of bit_stride and the block width is blk_bit_width,
12 | // examples:
13 | //  size_bits<T> = 8, bit_stride = 8,  blk_bit_width = 32 -> 4:1
14 | //  size_bits<T> = 8, bit_stride = 16, blk_bit_width = 32 -> (2, 2):(2, 1)
15 | //  size_bits<T> = 4, bit_stride = 8,  blk_bit_width = 32 -> (4, 2):(2, 1)
16 | //  size_bits<T> = 4, bit_stride = 16, blk_bit_width = 32 -> (2, 4):(4, 1)
17 | template <typename T, int bit_stride, int blk_bit_width>
18 | CUTE_HOST_DEVICE static constexpr auto get_interleaved_blk_layout() {
19 |   static_assert(blk_bit_width % bit_stride == 0);
20 |   static_assert(bit_stride % cute::sizeof_bits_v<T> == 0);
21 | 
22 |   constexpr auto elems_per_blk = blk_bit_width / cute::sizeof_bits_v<T>;
23 | 
24 |   if constexpr (cute::sizeof_bits_v<T> == bit_stride) {
25 |     // identity layout
26 |     return Layout<Shape<Int<elems_per_blk>>>{};
27 |   } else {
28 |     constexpr auto elems_per_stride = bit_stride / cute::sizeof_bits_v<T>;
29 |     constexpr auto num_strides = elems_per_blk / elems_per_stride;
30 |     return Layout<Shape<Int<num_strides>, Int<elems_per_stride>>,
31 |                   Stride<Int<elems_per_stride>, Int<1>>>{};
32 |   }
33 | }
34 | 
35 | };  // namespace machete
36 | 


--------------------------------------------------------------------------------
/infra/csrc/quantization/marlin/dense/common/base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Modified by HandH1998
 3 |  * Modified by Neural Magic
 4 |  * Copyright (C) Marlin.2024 Elias Frantar
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *         http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | #pragma once
20 | 
21 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
22 | 
23 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed
24 | // for instance as inputs to tensor core operations. Consequently, all
25 | // corresponding index accesses must be compile-time constants, which is why we
26 | // extensively use `#pragma unroll` throughout the kernel code to guarantee
27 | // this.
28 | template <typename T, int n>
29 | struct Vec {
30 |   T elems[n];
31 |   __device__ T& operator[](int i) { return elems[i]; }
32 | };
33 | 


--------------------------------------------------------------------------------
/infra/csrc/rocm/ops.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/all.h>
 4 | 
 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
 6 |                      torch::Tensor& max_logits, torch::Tensor& tmp_out,
 7 |                      torch::Tensor& query, torch::Tensor& key_cache,
 8 |                      torch::Tensor& value_cache, int64_t num_kv_heads,
 9 |                      double scale, torch::Tensor& block_tables,
10 |                      torch::Tensor& context_lens, int64_t block_size,
11 |                      int64_t max_context_len,
12 |                      const c10::optional<torch::Tensor>& alibi_slopes,
13 |                      const std::string& kv_cache_dtype, double k_scale,
14 |                      double v_scale);
15 | 


--------------------------------------------------------------------------------
/infra/csrc/rocm/torch_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include "core/registration.h"
 2 | #include "rocm/ops.h"
 3 | 
 4 | // Note on op signatures:
 5 | // The X_meta signatures are for the meta functions corresponding to op X.
 6 | // They must be kept in sync with the signature for X. Generally, only
 7 | // functions that return Tensors require a meta function.
 8 | //
 9 | // See the following links for detailed docs on op registration and function
10 | // schemas.
11 | // https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
12 | // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
13 | 
14 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
15 |   // vLLM custom ops for rocm
16 | 
17 |   // Custom attention op
18 |   // Compute the attention between an input query and the cached
19 |   // keys/values using PagedAttention.
20 |   rocm_ops.def(
21 |       "paged_attention(Tensor! out, Tensor exp_sums,"
22 |       "                Tensor max_logits, Tensor tmp_out,"
23 |       "                Tensor query, Tensor key_cache,"
24 |       "                Tensor value_cache, int num_kv_heads,"
25 |       "                float scale, Tensor block_tables,"
26 |       "                Tensor context_lens, int block_size,"
27 |       "                int max_context_len,"
28 |       "                Tensor? alibi_slopes,"
29 |       "                str kv_cache_dtype,"
30 |       "                float k_scale, float v_scale) -> ()");
31 |   rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
32 | }
33 | 
34 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
35 | 


--------------------------------------------------------------------------------
/infra/requirements-vllm-build.txt:
--------------------------------------------------------------------------------
1 | setuptools-scm>=8
2 | wheel
3 | cmake>=3.26
4 | ninja
5 | 


--------------------------------------------------------------------------------
/infra/requirements-vllm.txt:
--------------------------------------------------------------------------------
 1 | transformers == 4.48.0  # Required for Llama 3.2.
 2 | peft
 3 | psutil
 4 | typing_extensions >= 4.10
 5 | msgspec
 6 | pydantic >= 2.9  # Required for fastapi >= 0.113.0
 7 | gguf == 0.10.0
 8 | sentencepiece  # Required for LLaMA tokenizer.
 9 | mistral_common[opencv] >= 1.4.4
10 | py-cpuinfo
11 | aiohttp
12 | openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
13 | uvicorn[standard]
14 | fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
15 | fastapi-utils
16 | typing-inspect
17 | pyzmq
18 | cloudpickle
19 | partial-json-parser # used for parsing partial JSON outputs
20 | prometheus_client >= 0.18.0
21 | prometheus-fastapi-instrumentator >= 7.0.0
22 | outlines >= 0.0.43, < 0.1
23 | einops # Required for Qwen2-VL.
24 | protobuf
25 | nvidia-ml-py # for pynvml package
26 | persist-queue
27 | 


--------------------------------------------------------------------------------
/infra/slurm_configs/cgroup.conf:
--------------------------------------------------------------------------------
1 | CgroupPlugin=cgroup/docker
2 | CgroupMountpoint=/tmp/cgroup
3 | ConstrainCores=yes
4 | ConstrainDevices=yes
5 | ConstrainRAMSpace=yes
6 | 


--------------------------------------------------------------------------------
/infra/slurm_configs/gres.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/gres.conf


--------------------------------------------------------------------------------
/infra/slurm_configs/munge.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/munge.key


--------------------------------------------------------------------------------
/infra/slurm_configs/slurm.conf:
--------------------------------------------------------------------------------
 1 | SlurmctldHost=1d4b95352faa
 2 | SlurmctldPort=6817
 3 | SlurmdPort=6818
 4 | SrunPortRange=7000-7200
 5 | JobRequeue=0
 6 | MpiDefault=none
 7 | ProctrackType=proctrack/linuxproc
 8 | ReturnToService=1
 9 | SlurmctldPidFile=/var/run/slurmctld.pid
10 | SlurmdPidFile=/var/run/slurmd.pid
11 | SlurmdSpoolDir=/var/spool/slurmd
12 | SlurmUser=root
13 | SlurmdUser=root
14 | StateSaveLocation=/var/spool
15 | SwitchType=switch/none
16 | TaskPlugin=task/none
17 | InactiveLimit=0
18 | KillWait=30
19 | MinJobAge=300
20 | SlurmctldTimeout=120
21 | SlurmdTimeout=300
22 | Waittime=0
23 | SchedulerType=sched/backfill
24 | SelectType=select/cons_tres
25 | SelectTypeParameters=CR_Core
26 | AccountingStorageType=accounting_storage/none
27 | AccountingStoreFlags=job_comment
28 | ClusterName=supermas
29 | JobCompType=jobcomp/none
30 | JobAcctGatherFrequency=30
31 | JobAcctGatherType=jobacct_gather/none
32 | SlurmctldDebug=debug
33 | AuthType=auth/none
34 | CredType=cred/none
35 | SlurmctldLogFile=/var/log/slurm/slurmctld.log
36 | SlurmdDebug=debug
37 | SlurmdLogFile=/var/log/slurm/slurmd.log
38 | NodeName=1d4b95352faa CPUs=8  State=UNKNOWN
39 | PartitionName=short Nodes=1d4b95352faa Default=YES MaxTime=20 State=UP
40 | 


--------------------------------------------------------------------------------
/infra/slurm_configs/slurm.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/infra/slurm_configs/slurm.key


--------------------------------------------------------------------------------
/infra/slurm_src/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 | 
15 | SOURCE_FILE=$LOCAL_DIRECTORY/cgroup_docker.c
16 | INCLUDE_PATH=/usr/include
17 | 
18 | # Compile the cgroup_docker.c file into a shared object file
19 | gcc -I$INCLUDE_PATH -Wall -fPIC -shared -o $LOCAL_DIRECTORY/cgroup_docker.so $SOURCE_FILE
20 | 
21 | # Determine if the target is an x86_64 or aarch64 machine
22 | if [ "$(uname -m)" == "x86_64" ]; then
23 |     TARGET="x86_64-linux-gnu"
24 | elif [ "$(uname -m)" == "aarch64" ]; then
25 |     TARGET="aarch64-linux-gnu"
26 | else
27 |     echo "Unsupported architecture"
28 |     exit 1
29 | fi
30 | 
31 | # Copy the shared object file to the /usr/lib directory
32 | cp /app/cray/infra/slurm_src/cgroup_docker.so /usr/lib/$TARGET/slurm-wlm/cgroup_docker.so
33 | 
34 | # Disable the plugin on the AMD target
35 | if [ $BASE_NAME == "amd" ]; then
36 |     sed -i -e 's/CgroupPlugin=cgroup\/docker/CgroupPlugin=cgroup\/v1/g' /app/cray/infra/slurm_configs/cgroup.conf
37 | fi
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/collectives/data_parallelism.py:
--------------------------------------------------------------------------------
1 | from gpu_aware_mpi import get_rank, get_size
2 | 
3 | def get_data_parallel_rank():
4 |     return get_rank()
5 | 
6 | 
7 | def get_data_parallel_world_size():
8 |     return get_size()
9 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/collectives/main_rank_only.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from gpu_aware_mpi import get_rank, barrier
 3 | 
 4 | 
 5 | def is_main_rank():
 6 |     return get_rank() == 0
 7 | 
 8 | def main_rank_only(func):
 9 |     def wrap_function(*args, **kwargs):
10 |         result = None
11 |         barrier()
12 |         if is_main_rank():
13 |             result = func(*args, **kwargs)
14 |         barrier()
15 |         return result
16 | 
17 |     return wrap_function
18 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/huggingface/download_model.py:
--------------------------------------------------------------------------------
1 | from cray_megatron.collectives.main_rank_only import main_rank_only
2 | 
3 | from huggingface_hub import snapshot_download
4 | 
5 | 
6 | @main_rank_only
7 | def download_model(model_name):
8 |     snapshot_download(repo_id=model_name)
9 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/megatron/dataset/data_loader.py:
--------------------------------------------------------------------------------
 1 | from cray_megatron.megatron.dataset.load_dataset import load_dataset
 2 | 
 3 | from cray_infra.util.get_job_config import get_job_config
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | class DataLoader:
 9 |     def __init__(self, model, tokenizer):
10 | 
11 |         self.model = model
12 |         self.tokenizer = tokenizer
13 |         self.batch_size = get_batch_size()
14 |         self.epoch = 0
15 | 
16 |         self.dataset = load_dataset(
17 |             model=self.model,
18 |             tokenizer=self.tokenizer,
19 |             epoch=self.epoch,
20 |         )
21 | 
22 |         self.loader = torch.utils.data.DataLoader(
23 |             self.dataset, batch_size=self.batch_size
24 |         )
25 | 
26 |     def __iter__(self):
27 |         self.iterator = iter(self.loader)
28 |         return self
29 | 
30 |     def __next__(self):
31 |         try:
32 |             return next(self.iterator)
33 |         except StopIteration:
34 |             self.epoch += 1
35 |             self.dataset = load_dataset(
36 |                 model=self.model,
37 |                 tokenizer=self.tokenizer,
38 |                 epoch=self.epoch,
39 |             )
40 |             self.loader = torch.utils.data.DataLoader(
41 |                 self.dataset, batch_size=self.batch_size
42 |             )
43 |             self.iterator = iter(self.loader)
44 | 
45 |             return next(self.iterator)
46 | 
47 | 
48 | def get_batch_size():
49 |     job_config = get_job_config()
50 |     return job_config["batch_size"]
51 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/megatron/distribution/apply_distribution_strategy.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from cray_infra.training.distribution_strategy.fsdp.fsdp import SimpleFSDP
 3 | 
 4 | from gpu_aware_mpi import get_size, get_rank
 5 | 
 6 | def load_distribution_strategy():
 7 |     device = get_device()
 8 | 
 9 |     strategy = {
10 |         "device": device,
11 |     }
12 | 
13 |     if get_size() > 1:
14 |         strategy["strategy"] = SimpleFSDP
15 | 
16 |     return strategy
17 | 
18 | 
19 | def get_device():
20 |     if torch.cuda.is_available():
21 |         rank = get_rank()
22 | 
23 |         gpu_count = torch.cuda.device_count()
24 | 
25 |         selected_gpu = rank % gpu_count
26 | 
27 |         if gpu_count > 1:
28 |             return torch.device(f"cuda:{selected_gpu}")
29 | 
30 |         return torch.cuda.current_device()
31 |     else:
32 |         return torch.device("cpu")
33 | 
34 | 
35 | def apply_distribution_strategy(model_info):
36 |     distribution_strategy = load_distribution_strategy()
37 |     model_info["distribution_strategy"] = distribution_strategy
38 |     return model_info
39 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/megatron/megatron_trainer.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.training.training_harness import TrainingHarness
 2 | from cray_infra.training.training_job_status import TrainingJobStatus
 3 | from cray_infra.training.print_logo import print_logo
 4 | 
 5 | from cray_megatron.megatron.training_loop import TrainingLoop, get_max_steps
 6 | 
 7 | import sys
 8 | 
 9 | import logging
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | class MegatronTrainer:
15 |     def __init__(self, training_harness: TrainingHarness):
16 |         self.training_harness = training_harness
17 | 
18 |     def train(self):
19 |         self.train_loop()
20 | 
21 |     def train_loop(self):
22 |         self.training_harness.update_status(
23 |             status=TrainingJobStatus.TRAINING, metadata={"max_steps": get_max_steps()}
24 |         )
25 | 
26 |         print_logo()
27 | 
28 |         TrainingLoop(self.training_harness).train()
29 | 
30 |         self.training_harness.update_status(status=TrainingJobStatus.COMPLETED)
31 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/models/does_any_checkpoint_exist.py:
--------------------------------------------------------------------------------
1 | from cray_megatron.models.get_latest_checkpoint_path import (
2 |     get_latest_checkpoint_path,
3 | )
4 | 
5 | 
6 | def does_any_checkpoint_exist():
7 |     return get_latest_checkpoint_path() is not None
8 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/models/get_model_manager.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from cray_infra.util.get_config import get_config
 3 | 
 4 | from cray_megatron.models.tokenformer.tokenformer_model_manager import TokenformerModelManager
 5 | 
 6 | def get_model_manager():
 7 |     config = get_config()
 8 | 
 9 |     return TokenformerModelManager()
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/models/model_manager_base.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from cray_megatron.models.get_latest_checkpoint_path import get_latest_checkpoint_path
 3 | from cray_megatron.models.does_any_checkpoint_exist import does_any_checkpoint_exist
 4 | 
 5 | from abc import ABC, abstractmethod
 6 | 
 7 | class ModelManagerBase(ABC):
 8 |     @abstractmethod
 9 |     def load_model(self):
10 |         pass
11 | 
12 |     def does_any_checkpoint_exist(self):
13 |         return does_any_checkpoint_exist()
14 | 
15 |     def get_latest_checkpoint_path(self):
16 |         return get_latest_checkpoint_path()
17 | 
18 | 


--------------------------------------------------------------------------------
/ml/cray_megatron/models/tokenformer/tokenformer_model_manager.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from cray_megatron.models.model_manager_base import ModelManagerBase
 3 | 
 4 | from cray_megatron.models.tokenformer.load_tokenformer_model import load_tokenformer_model
 5 | 
 6 | class TokenformerModelManager(ModelManagerBase):
 7 |     def load_model(self):
 8 |         return load_tokenformer_model()
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy<2.0.0
 2 | jsonlines
 3 | aiofiles
 4 | persist-queue
 5 | matplotlib
 6 | streaming-form-data==1.15.0
 7 | mpi4py==4.0.3
 8 | openmpi==0.0.0
 9 | humanize
10 | 


--------------------------------------------------------------------------------
/scalarlm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 | 
15 | # Refresh the bashly command
16 | $LOCAL_DIRECTORY/cmd/bashly.sh generate
17 | 
18 | # Call the generated CLI script
19 | $LOCAL_DIRECTORY/scripts/scalarlm "$@"
20 | 


--------------------------------------------------------------------------------
/scripts/start_one_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 | 
15 | $LOCAL_DIRECTORY/start_slurm.sh
16 | 
17 | python -m cray_infra.one_server.main
18 | 
19 | 


--------------------------------------------------------------------------------
/scripts/start_slurm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | # Get the directory of this script
13 | LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
14 | 
15 | # Run the slurm discovery service
16 | python $LOCAL_DIRECTORY/../infra/cray_infra/slurm/discovery/discover_clusters.py
17 | 
18 | slurmctld
19 | slurmd
20 | 
21 | 


--------------------------------------------------------------------------------
/scripts/train_job_entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Safely execute this bash script
 4 | # e exit on first failure
 5 | # x all executed commands are printed to the terminal
 6 | # u unset variables are errors
 7 | # a export all variables to the environment
 8 | # E any trap on ERR is inherited by shell functions
 9 | # -o pipefail | produces a failure code if any stage fails
10 | set -Eeuoxa pipefail
11 | 
12 | export CRAY_TRAINING_JOB_CONFIG_PATH=REPLACE_CONFIG_PATH
13 | 
14 | # Get the directory of this script
15 | LOCAL_DIRECTORY="$( cd "$( dirname "${CRAY_TRAINING_JOB_CONFIG_PATH}" )" >/dev/null 2>&1 && pwd )"
16 | 
17 | # Put the current ml directory in the python path so that the modules can be imported
18 | export PYTHONPATH=$LOCAL_DIRECTORY/ml:$PYTHONPATH
19 | 
20 | mpirun --allow-run-as-root --oversubscribe python $LOCAL_DIRECTORY/ml/cray_megatron/main.py $*
21 | 


--------------------------------------------------------------------------------
/sdk/masint/__init__.py:
--------------------------------------------------------------------------------
1 | from masint.api.async_supermassive_intelligence import AsyncSupermassiveIntelligence
2 | from masint.api.supermassive_intelligence import SupermassiveIntelligence
3 | 


--------------------------------------------------------------------------------
/sdk/masint/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/api/__init__.py


--------------------------------------------------------------------------------
/sdk/masint/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/cli/__init__.py


--------------------------------------------------------------------------------
/sdk/masint/cli/squeue.py:
--------------------------------------------------------------------------------
 1 | from masint.util.make_api_url import make_api_url
 2 | 
 3 | import aiohttp
 4 | import asyncio
 5 | 
 6 | import logging
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | def squeue():
12 |     logger.info(f"Getting squeue")
13 | 
14 |     try:
15 |         asyncio.run(squeue_async())
16 |     except Exception as e:
17 |         logger.error(f"Failed to get squeue output")
18 |         logger.error(e)
19 | 
20 | 
21 | async def squeue_async():
22 |     async with aiohttp.ClientSession() as session:
23 |         async with session.get(make_api_url(f"v1/megatron/squeue")) as resp:
24 |             data = await resp.json()
25 | 
26 |             logger.info(f"Got response for squeue")
27 |             logger.info(data)
28 | 
29 |             if resp.status != 200:
30 |                 logger.error(f"Failed to get squeue")
31 |                 logger.error(data)
32 |                 raise Exception("Failed to get squeue")
33 | 
34 |             print(data["squeue_output"])
35 | 
36 | 


--------------------------------------------------------------------------------
/sdk/masint/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/engines/__init__.py


--------------------------------------------------------------------------------
/sdk/masint/engines/cray/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/engines/cray/__init__.py


--------------------------------------------------------------------------------
/sdk/masint/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/sdk/masint/util/__init__.py


--------------------------------------------------------------------------------
/sdk/masint/util/get_api_base.py:
--------------------------------------------------------------------------------
 1 | import masint
 2 | import scalarlm
 3 | 
 4 | import os
 5 | 
 6 | def get_api_base():
 7 |     if hasattr(scalarlm, "api_url") and scalarlm.api_url is not None:
 8 |         return scalarlm.api_url
 9 | 
10 |     if hasattr(masint, "api_url") and masint.api_url is not None:
11 |         return masint.api_url
12 | 
13 |     if "SCALARLM_API_URL" in os.environ:
14 |         return os.environ["SCALARLM_API_URL"]
15 | 
16 |     if "MASINT_API_URL" in os.environ:
17 |         return os.environ["MASINT_API_URL"]
18 | 
19 |     return "http://localhost:8000"
20 | 
21 | 


--------------------------------------------------------------------------------
/sdk/masint/util/make_api_url.py:
--------------------------------------------------------------------------------
 1 | from masint.util.get_api_base import get_api_base
 2 | 
 3 | 
 4 | def make_api_url(endpoint, api_url=None):
 5 |     if api_url is not None:
 6 |         api_base = api_url
 7 |     else:
 8 |         api_base = get_api_base()
 9 |     return f"{api_base}/{endpoint}"
10 | 


--------------------------------------------------------------------------------
/sdk/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=65.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "scalarlm"
 7 | version = "0.81"
 8 | authors = [
 9 |   { name="Greg Diamos", email="gregory.diamos@gmail.com" },
10 | ]
11 | description = "ScalarLM is a unified LLM inference and training platform"
12 | readme = "README.md"
13 | requires-python = ">=3.7"
14 | classifiers = [
15 |     "Programming Language :: Python :: 3",
16 |     "License :: OSI Approved :: Apache Software License",
17 |     "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 |     "aiohttp",
21 |     "aiofiles",
22 |     "jsonlines",
23 |     "matplotlib",
24 |     "humanize",
25 | ]
26 | 
27 | [tool.setuptools]
28 | packages = [
29 |     "scalarlm",
30 |     "masint",
31 |     "masint.api",
32 |     "masint.cli",
33 |     "masint.util",
34 |     "masint.engines",
35 |     "masint.engines.cray",
36 | ]
37 | 
38 | [project.scripts]
39 | scalarlm = "masint.cli.main:main"
40 | 
41 | [tool.autopep8]
42 | max_line_length = 120
43 | in-place = true
44 | recursive = true
45 | aggressive = 2
46 | 
47 | [project.urls]
48 | Homepage = "https://github.com/tensorwavecloud/scalarlm"
49 | 
50 | 


--------------------------------------------------------------------------------
/sdk/scalarlm/__init__.py:
--------------------------------------------------------------------------------
1 | from masint import *
2 | 


--------------------------------------------------------------------------------
/test/benchmark/main.py:
--------------------------------------------------------------------------------
 1 | from benchmark.pytorch.memcpy import benchmark_memcpy
 2 | from benchmark.pytorch.memcpy_peer import benchmark_memcpy_peer
 3 | from benchmark.pytorch.gemm import benchmark_gemm
 4 | from benchmark.pytorch.forward import benchmark_forward
 5 | from benchmark.pytorch.backward import benchmark_backward
 6 | 
 7 | from benchmark.roofline.plot_roofline import plot_roofline
 8 | from benchmark.roofline.plot_bandwidth_sweep import plot_bandwidth_sweep
 9 | 
10 | import os
11 | 
12 | import logging
13 | 
14 | def main():
15 |     setup_logging()
16 | 
17 |     os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"
18 | 
19 |     benchmark_memcpy()
20 |     benchmark_memcpy_peer()
21 |     benchmark_gemm()
22 |     benchmark_forward()
23 |     benchmark_backward()
24 | 
25 |     plot_roofline()
26 |     plot_bandwidth_sweep()
27 | 
28 | 
29 | def setup_logging():
30 |     logging.basicConfig(level=logging.INFO)
31 | 
32 | main()
33 | 


--------------------------------------------------------------------------------
/test/deployment/embed.py:
--------------------------------------------------------------------------------
 1 | import scalarlm
 2 | 
 3 | 
 4 | scalarlm.api_url = "http://localhost:8000"
 5 | #scalarlm.api_url = "https://meta-llama--llama-3-2-3b-instruct.cray-lm.com"
 6 | #scalarlm.api_url = "https://greg1232--cray-cpu-llama-3-2-1b-instruct-fastapi-app.modal.run"
 7 | #scalarlm.api_url = "https://greg1232--cray-nvidia-llama-3-2-3b-instruct-fastapi-app.modal.run"
 8 | 
 9 | 
10 | def get_dataset(count):
11 |     dataset = []
12 | 
13 |     for i in range(count):
14 |         dataset.append(f"What is {i} + {i}?")
15 | 
16 |     return dataset
17 | 
18 | 
19 | llm = scalarlm.SupermassiveIntelligence()
20 | 
21 | dataset = get_dataset(count=3)
22 | 
23 | results = llm.embed(prompts=dataset,
24 | # generate with default model
25 | # model_name="c7c3ed39e0005e0e73145d49510c94d7b5e4f6552cd35c4a7a8b37d0b41f318e"
26 | )
27 | 
28 | print(results)
29 | 
30 | 


--------------------------------------------------------------------------------
/test/deployment/generate.py:
--------------------------------------------------------------------------------
 1 | import scalarlm
 2 | 
 3 | scalarlm.api_url = "http://localhost:8000"
 4 | 
 5 | def get_dataset(count):
 6 |     dataset = []
 7 | 
 8 |     for i in range(count):
 9 |         dataset.append(f"What is {i} + {i}?")
10 | 
11 |     return dataset
12 | 
13 | 
14 | llm = scalarlm.SupermassiveIntelligence()
15 | 
16 | dataset = get_dataset(count=1)
17 | 
18 | results = llm.generate(
19 |     prompts=dataset,
20 |     max_tokens=200,
21 | )
22 | 
23 | print(results)
24 | 


--------------------------------------------------------------------------------
/test/deployment/health.py:
--------------------------------------------------------------------------------
 1 | import scalarlm
 2 | 
 3 | scalarlm.api_url = "http://localhost:8000"
 4 | 
 5 | llm = scalarlm.SupermassiveIntelligence()
 6 | 
 7 | results = llm.health()
 8 | 
 9 | print(results)
10 | 


--------------------------------------------------------------------------------
/test/deployment/train.py:
--------------------------------------------------------------------------------
 1 | import scalarlm
 2 | 
 3 | scalarlm.api_url = "http://localhost:8000"
 4 | 
 5 | def get_dataset():
 6 |     dataset = []
 7 | 
 8 |     count = 1
 9 | 
10 |     for i in range(count):
11 |         dataset.append({"input": f"What is {i} + {i}?", "output": str(i + i)})
12 | 
13 |     return dataset * 100
14 | 
15 | 
16 | llm = scalarlm.SupermassiveIntelligence(api_url=scalarlm.api_url)
17 | 
18 | dataset = get_dataset()
19 | 
20 | status = llm.train(
21 |     dataset,
22 |     train_args={"max_steps": 100, "learning_rate": 1e-4, "gpus": 2,
23 |             "max_token_block_size": 4096,
24 |             "steps_per_checkpoint": 10000},
25 | )
26 | 
27 | print(status)
28 | 


--------------------------------------------------------------------------------
/test/infra/generate.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.start_cray_server import start_cray_server
 2 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm
 3 | 
 4 | import masint
 5 | 
 6 | import unittest
 7 | 
 8 | import logging
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class TestGenerate(unittest.IsolatedAsyncioTestCase):
14 |     async def asyncSetUp(self):
15 | 
16 |         logger.info("Starting server")
17 | 
18 |         self.app = await start_cray_server(server_list=["api", "vllm"])
19 | 
20 |         logger.debug(f"Server started: {self.app}")
21 | 
22 |     async def test_generate_single(self):
23 |         logger.debug("Testing generate single")
24 | 
25 |         await wait_for_vllm()
26 | 
27 |         llm = masint.AsyncSupermassiveIntelligence()
28 | 
29 |         result = await llm.generate(prompts=["What is 1 + 1?"])
30 | 
31 |         logger.debug(f"Result: {result}")
32 | 
33 |     async def test_generate_batch(self):
34 |         logger.debug("Testing generate batch")
35 | 
36 |         await wait_for_vllm()
37 | 
38 |         llm = masint.AsyncSupermassiveIntelligence()
39 | 
40 |         prompts = [
41 |             "What is 1 + 1?",
42 |             "What is 2 + 2?",
43 |             "What is 3 + 3?",
44 |             "What is 4 + 4?",
45 |         ]
46 | 
47 |         result = await llm.generate(prompts=prompts)
48 | 
49 |         logger.debug(f"Result: {result}")
50 | 
51 |     async def asyncTearDown(self):
52 |         logger.debug("Shutting down server")
53 |         await self.app.shutdown()
54 | 


--------------------------------------------------------------------------------
/test/infra/get_results.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.start_cray_server import start_cray_server
 2 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm
 3 | 
 4 | import masint
 5 | 
 6 | import unittest
 7 | 
 8 | import logging
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class TestGetResults(unittest.IsolatedAsyncioTestCase):
14 |     async def asyncSetUp(self):
15 | 
16 |         logger.info("Starting server")
17 | 
18 |         self.app = await start_cray_server(server_list=["api", "vllm"])
19 | 
20 |         logger.debug(f"Server started: {self.app}")
21 | 
22 |     async def test_generate_batch(self):
23 |         logger.debug("Testing generate batch")
24 | 
25 |         await wait_for_vllm()
26 | 
27 |         llm = masint.AsyncSupermassiveIntelligence()
28 | 
29 |         prompts = ["What is 1 + 1?", "What is 2 + 2?", "What is 3 + 3?", "What is 4 + 4?"]
30 | 
31 |         results = await llm.submit_generate(prompts=prompts)
32 | 
33 |         logger.debug(f"Results: {results}")
34 | 
35 |         ids = [r["request_id"] for r in results["results"]]
36 | 
37 |         logger.debug(f"IDs: {ids}")
38 | 
39 |         new_results = await llm.get_results(ids)
40 | 
41 |         logger.debug(f"New Results: {new_results}")
42 | 
43 |         for r in new_results["results"]:
44 |             self.assertTrue(r["request_id"] in ids)
45 | 
46 | 
47 |     async def asyncTearDown(self):
48 |         logger.debug("Shutting down server")
49 |         await self.app.shutdown()
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/test/infra/health.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.start_cray_server import start_cray_server
 2 | from cray_infra.util.get_config import get_config
 3 | 
 4 | import masint
 5 | 
 6 | import aiohttp
 7 | import unittest
 8 | import pytest
 9 | 
10 | import logging
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | logging.basicConfig(level=logging.DEBUG)
15 | 
16 | 
17 | class TestHealth(unittest.IsolatedAsyncioTestCase):
18 |     async def asyncSetUp(self):
19 | 
20 |         logger.info("Starting server")
21 | 
22 |         self.app = await start_cray_server(server_list=["api"])
23 | 
24 |         logger.debug(f"Server started: {self.app}")
25 | 
26 |     async def test_health(self):
27 |         logger.debug("Testing health endpoint")
28 |         health_status = await get_health()
29 | 
30 |         self.assertEqual(health_status["api"], "up")
31 | 
32 |     async def test_health_client(self):
33 |         logger.debug("Testing health endpoint with client")
34 | 
35 |         llm = masint.AsyncSupermassiveIntelligence()
36 | 
37 |         status = await llm.health()
38 | 
39 |         self.assertEqual(status["api"], "up")
40 | 
41 |     async def asyncTearDown(self):
42 |         logger.debug("Shutting down server")
43 |         await self.app.shutdown()
44 | 
45 | 
46 | async def get_health():
47 |     config = get_config()
48 | 
49 |     async with aiohttp.ClientSession() as session:
50 |         async with session.get(config["api_url"] + "/v1/health") as response:
51 |             return await response.json()
52 | 


--------------------------------------------------------------------------------
/test/infra/openai_client.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.util.get_config import get_config
 2 | 
 3 | from cray_infra.one_server.start_cray_server import start_cray_server
 4 | from cray_infra.one_server.wait_for_vllm import wait_for_vllm
 5 | 
 6 | from openai import AsyncOpenAI
 7 | 
 8 | import unittest
 9 | import asyncio
10 | 
11 | import logging
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | class TestOpenAIClient(unittest.IsolatedAsyncioTestCase):
17 |     async def asyncSetUp(self):
18 | 
19 |         logger.info("Starting server")
20 | 
21 |         self.app = await start_cray_server(server_list=["api", "vllm"])
22 | 
23 |         logger.debug(f"Server started: {self.app}")
24 | 
25 |     async def test_openai_client(self):
26 |         logger.debug("Testing openai client")
27 | 
28 |         await wait_for_vllm()
29 | 
30 |         config = get_config()
31 | 
32 |         client = AsyncOpenAI(
33 |             base_url=config["api_url"] + "/v1/openai",
34 |             api_key="token-abc123",
35 |         )
36 | 
37 |         completion = await client.chat.completions.create(
38 |             model=config["model"],
39 |             messages=[{"role": "user", "content": "Hello!"}],
40 |             max_tokens=10,
41 |         )
42 | 
43 |         print(completion.choices[0].message)
44 | 
45 |     async def asyncTearDown(self):
46 |         logger.debug("Shutting down server")
47 |         await self.app.shutdown()
48 | 


--------------------------------------------------------------------------------
/test/infra/sanity.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | 
4 | class TestSanity(unittest.TestCase):
5 |     def test_sanity(self):
6 |         print("Sanity test")
7 |         self.assertTrue(True)
8 | 


--------------------------------------------------------------------------------
/test/infra/slurm.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import subprocess
 3 | import unittest
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | class TestSlurm(unittest.TestCase):
 9 |     def test_srun(self):
10 |         run_command = [
11 |             "srun",
12 |             "hostname",
13 |         ]
14 |         result = subprocess.run(run_command, stdout=subprocess.PIPE)
15 | 
16 |         logger.debug(f"result: {result}")
17 | 
18 |         self.assertTrue(result.returncode == 0)
19 | 


--------------------------------------------------------------------------------
/test/infra/upload_dataset.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.start_cray_server import start_cray_server
 2 | 
 3 | import masint
 4 | 
 5 | import unittest
 6 | 
 7 | import logging
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class TestUploadDataset(unittest.IsolatedAsyncioTestCase):
13 |     async def asyncSetUp(self):
14 | 
15 |         logger.info("Starting server")
16 | 
17 |         self.app = await start_cray_server(server_list=["api"])
18 | 
19 |         logger.debug(f"Server started: {self.app}")
20 | 
21 |     async def test_upload_dataset(self):
22 |         logger.debug("Testing upload ability of train endpoint")
23 | 
24 |         llm = masint.AsyncSupermassiveIntelligence()
25 | 
26 |         dataset = get_dataset()
27 | 
28 |         status = await llm.train(dataset, train_args={"max_steps": 1})
29 | 
30 |     async def asyncTearDown(self):
31 |         logger.debug("Shutting down server")
32 |         await self.app.shutdown()
33 | 
34 | 
35 | def get_dataset():
36 |     dataset = []
37 | 
38 |     count = 10000
39 | 
40 |     for i in range(count):
41 |         dataset.append(
42 |             {"input": f"What is {i} + {i}", "output": "The answer is " + str(i + i)}
43 |         )
44 | 
45 |     return dataset
46 | 


--------------------------------------------------------------------------------
/test/infra/vllm_health.py:
--------------------------------------------------------------------------------
 1 | from cray_infra.one_server.start_cray_server import start_cray_server
 2 | from cray_infra.one_server.wait_for_vllm import get_vllm_health, wait_for_vllm
 3 | 
 4 | import unittest
 5 | 
 6 | import logging
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | class TestVLLMHealth(unittest.IsolatedAsyncioTestCase):
12 |     async def asyncSetUp(self):
13 | 
14 |         logger.info("Starting server")
15 | 
16 |         self.app = await start_cray_server(server_list=["vllm"])
17 | 
18 |         logger.debug(f"Server started: {self.app}")
19 | 
20 |     async def test_vllm_health(self):
21 |         logger.debug("Testing health endpoint")
22 | 
23 |         await wait_for_vllm()
24 | 
25 |         health_status = await get_vllm_health()
26 | 
27 |         self.assertEqual(health_status, 200)
28 | 
29 |     async def asyncTearDown(self):
30 |         logger.debug("Shutting down server")
31 |         await self.app.shutdown()
32 | 


--------------------------------------------------------------------------------
/test/ml/rl/cs_semester.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorwavecloud/ScalarLM/28a91d970bc195bdf4b48857b506c1c78a171ef5/test/ml/rl/cs_semester.sqlite


--------------------------------------------------------------------------------
/test/ml/tokenformer/test_llama_tokenformer_model.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from transformers import LlamaForCausalLM, LlamaTokenizer
 4 | 
 5 | from ml.tokenformer.llama_tokenformer_model import create_llama_tokenformer_model
 6 | 
 7 | @pytest.fixture
 8 | def model_setup():
 9 |     model_name = "masint/tiny-random-llama"
10 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11 |     tokenizer = LlamaTokenizer.from_pretrained(model_name)
12 |     model = LlamaForCausalLM.from_pretrained(model_name).to(device)
13 |     return model
14 | 
15 | def test_create_llama_tokenformer_model(model_setup):
16 |     model = model_setup
17 |     # lm_head is trained by default
18 |     result = create_llama_tokenformer_model(model, "cpu")
19 | 
20 |     # Check requires_grad is set correctly 
21 |     for name, param in result.named_parameters():
22 |         if any(module_name in name for module_name in ["tokenformer", "lm_head"]):
23 |             assert param.requires_grad
24 |         else:
25 |             assert not param.requires_grad
26 | 
27 | def test_create_llama_tokenformer_model_no_lm_head(model_setup):
28 |     model = model_setup
29 |     # lm_head should not be trained
30 |     result = create_llama_tokenformer_model(model=model, device="cpu", train_lm_head=False)
31 | 
32 |     # Check requires_grad is set correctly 
33 |     for name, param in result.named_parameters():
34 |         if any(module_name in name for module_name in ["tokenformer"]):
35 |             assert param.requires_grad
36 |         else:
37 |             assert not param.requires_grad
38 | 


--------------------------------------------------------------------------------
/test/requirements-pytest.txt:
--------------------------------------------------------------------------------
 1 | pytest
 2 | pytest-cov
 3 | pytest-xdist
 4 | pytest-forked
 5 | pytest-asyncio
 6 | pytest-dotenv
 7 | pytest-mock
 8 | pytest-rerunfailures
 9 | pytest-timeout
10 | codecov
11 | 


--------------------------------------------------------------------------------